From ef8d8ccdc216f797e66cb4a1372f5c4c285ce1e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 16 Aug 2019 21:26:22 -0700
Subject: tcp: make sure EPOLLOUT wont be missed

As Jason Baron explained in commit 790ba4566c1a ("tcp: set SOCK_NOSPACE
under memory pressure"), it is crucial we properly set SOCK_NOSPACE
when needed.

However, Jason patch had a bug, because the 'nonblocking' status
as far as sk_stream_wait_memory() is concerned is governed
by MSG_DONTWAIT flag passed at sendmsg() time :

    long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

So it is very possible that tcp sendmsg() calls sk_stream_wait_memory(),
and that sk_stream_wait_memory() returns -EAGAIN with SOCK_NOSPACE
cleared, if sk->sk_sndtimeo has been set to a small (but not zero)
value.

This patch removes the 'noblock' variable since we must always
set SOCK_NOSPACE if -EAGAIN is returned.

It also renames the do_nonblock label since we might reach this
code path even if we were in blocking mode.

Fixes: 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jason Baron <jbaron@akamai.com>
Reported-by: Vladimir Rutsky  <rutsky@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/stream.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/stream.c b/net/core/stream.c
index e94bb02a5629..4f1d4aa5fb38 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -120,7 +120,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 	int err = 0;
 	long vm_wait = 0;
 	long current_timeo = *timeo_p;
-	bool noblock = (*timeo_p ? false : true);
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
 	if (sk_stream_memory_free(sk))
@@ -133,11 +132,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 
 		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 			goto do_error;
-		if (!*timeo_p) {
-			if (noblock)
-				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-			goto do_nonblock;
-		}
+		if (!*timeo_p)
+			goto do_eagain;
 		if (signal_pending(current))
 			goto do_interrupted;
 		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -169,7 +165,13 @@ out:
 do_error:
 	err = -EPIPE;
 	goto out;
-do_nonblock:
+do_eagain:
+	/* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
+	 * be generated later.
+	 * When TCP receives ACK packets that make room, tcp_check_space()
+	 * only calls tcp_new_space() if SOCK_NOSPACE is set.
+	 */
+	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 	err = -EAGAIN;
 	goto out;
 do_interrupted:
-- 
cgit v1.2.3


From db38de39684dda2bf307f41797db2831deba64e9 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jakub@cloudflare.com>
Date: Wed, 21 Aug 2019 14:17:20 +0200
Subject: flow_dissector: Fix potential use-after-free on BPF_PROG_DETACH

Call to bpf_prog_put(), with help of call_rcu(), queues an RCU-callback to
free the program once a grace period has elapsed. The callback can run
together with new RCU readers that started after the last grace period.
New RCU readers can potentially see the "old" to-be-freed or already-freed
pointer to the program object before the RCU update-side NULLs it.

Reorder the operations so that the RCU update-side resets the protected
pointer before the end of the grace period after which the program will be
freed.

Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
Reported-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Petar Penkov <ppenkov@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/flow_dissector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3e6fedb57bc1..2470b4b404e6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -142,8 +142,8 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 		mutex_unlock(&flow_dissector_mutex);
 		return -ENOENT;
 	}
-	bpf_prog_put(attached);
 	RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+	bpf_prog_put(attached);
 	mutex_unlock(&flow_dissector_mutex);
 	return 0;
 }
-- 
cgit v1.2.3


From 2c238177bd7f4b14bdf7447cc1cd9bb791f147e6 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Tue, 20 Aug 2019 17:50:25 +0200
Subject: bpf: allow narrow loads of some sk_reuseport_md fields with offset >
 0

test_select_reuseport fails on s390 due to verifier rejecting
test_select_reuseport_kern.o with the following message:

	; data_check.eth_protocol = reuse_md->eth_protocol;
	18: (69) r1 = *(u16 *)(r6 +22)
	invalid bpf_context access off=22 size=2

This is because on big-endian machines casts from __u32 to __u16 are
generated by referencing the respective variable as __u16 with an offset
of 2 (as opposed to 0 on little-endian machines).

The verifier already has all the infrastructure in place to allow such
accesses, it's just that they are not explicitly enabled for
eth_protocol field. Enable them for eth_protocol field by using
bpf_ctx_range instead of offsetof.

Ditto for ip_protocol, bind_inany and len, since they already allow
narrowing, and the same problem can arise when working with them.

Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7878f918b8c0..4c6a252d4212 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8757,13 +8757,13 @@ sk_reuseport_is_valid_access(int off, int size,
 		return size == size_default;
 
 	/* Fields that allow narrowing */
-	case offsetof(struct sk_reuseport_md, eth_protocol):
+	case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
 		if (size < FIELD_SIZEOF(struct sk_buff, protocol))
 			return false;
 		/* fall through */
-	case offsetof(struct sk_reuseport_md, ip_protocol):
-	case offsetof(struct sk_reuseport_md, bind_inany):
-	case offsetof(struct sk_reuseport_md, len):
+	case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
+	case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
+	case bpf_ctx_range(struct sk_reuseport_md, len):
 		bpf_ctx_record_field_size(info, size_default);
 		return bpf_ctx_narrow_access_ok(off, size, size_default);
 
-- 
cgit v1.2.3


From b45ce32135d1c82a5bf12aa56957c3fd27956057 Mon Sep 17 00:00:00 2001
From: zhanglin <zhang.lin16@zte.com.cn>
Date: Fri, 23 Aug 2019 09:14:11 +0800
Subject: sock: fix potential memory leak in proto_register()

If protocols registered exceeded PROTO_INUSE_NR, prot will be
added to proto_list, but no available bit left for prot in
proto_inuse_idx.

Changes since v2:
* Propagate the error code properly

Signed-off-by: zhanglin <zhang.lin16@zte.com.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 6d08553f885c..545fac19a711 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3287,16 +3287,17 @@ static __init int net_inuse_init(void)
 
 core_initcall(net_inuse_init);
 
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
 {
 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
 
 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
 		pr_err("PROTO_INUSE_NR exhausted\n");
-		return;
+		return -ENOSPC;
 	}
 
 	set_bit(prot->inuse_idx, proto_inuse_idx);
+	return 0;
 }
 
 static void release_proto_idx(struct proto *prot)
@@ -3305,8 +3306,9 @@ static void release_proto_idx(struct proto *prot)
 		clear_bit(prot->inuse_idx, proto_inuse_idx);
 }
 #else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
 {
+	return 0;
 }
 
 static inline void release_proto_idx(struct proto *prot)
@@ -3355,6 +3357,8 @@ static int req_prot_init(const struct proto *prot)
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
+	int ret = -ENOBUFS;
+
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 					prot->obj_size, 0,
@@ -3391,20 +3395,27 @@ int proto_register(struct proto *prot, int alloc_slab)
 	}
 
 	mutex_lock(&proto_list_mutex);
+	ret = assign_proto_idx(prot);
+	if (ret) {
+		mutex_unlock(&proto_list_mutex);
+		goto out_free_timewait_sock_slab_name;
+	}
 	list_add(&prot->node, &proto_list);
-	assign_proto_idx(prot);
 	mutex_unlock(&proto_list_mutex);
-	return 0;
+	return ret;
 
 out_free_timewait_sock_slab_name:
-	kfree(prot->twsk_prot->twsk_slab_name);
+	if (alloc_slab && prot->twsk_prot)
+		kfree(prot->twsk_prot->twsk_slab_name);
 out_free_request_sock_slab:
-	req_prot_cleanup(prot->rsk_prot);
+	if (alloc_slab) {
+		req_prot_cleanup(prot->rsk_prot);
 
-	kmem_cache_destroy(prot->slab);
-	prot->slab = NULL;
+		kmem_cache_destroy(prot->slab);
+		prot->slab = NULL;
+	}
 out:
-	return -ENOBUFS;
+	return ret;
 }
 EXPORT_SYMBOL(proto_register);
 
-- 
cgit v1.2.3