From 40a34121ac1dc52ed9cd34a8f4e48e32517a52fd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Nov 2021 13:47:32 -0700
Subject: bpf, sockmap: Use stricter sk state checks in sk_lookup_assign

In order to fix an issue with sockets in TCP sockmap redirect cases we plan
to allow CLOSE state sockets to exist in the sockmap. However, the check in
bpf_sk_lookup_assign() currently only invalidates sockets in the
TCP_ESTABLISHED case relying on the checks on sockmap insert to ensure we
never SOCK_CLOSE state sockets in the map.

To prepare for this change we flip the logic in bpf_sk_lookup_assign() to
explicitly test for the accepted cases. Namely, a tcp socket in TCP_LISTEN
or a udp socket in TCP_CLOSE state. This also makes the code more resilent
to future changes.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-2-john.fastabend@gmail.com
---
 net/core/filter.c   | 6 ++++--
 net/core/sock_map.c | 6 ------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 8e8d3b49c297..a68418268e92 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10423,8 +10423,10 @@ BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
 		return -EINVAL;
 	if (unlikely(sk && sk_is_refcounted(sk)))
 		return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
-	if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED))
-		return -ESOCKTNOSUPPORT; /* reject connected sockets */
+	if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
+		return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
+	if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
+		return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
 
 	/* Check if socket is suitable for packet L3/L4 protocol */
 	if (sk && sk->sk_protocol != ctx->protocol)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index e252b8ec2b85..f39ef79ced67 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -511,12 +511,6 @@ static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
 	       ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
 }
 
-static bool sk_is_tcp(const struct sock *sk)
-{
-	return sk->sk_type == SOCK_STREAM &&
-	       sk->sk_protocol == IPPROTO_TCP;
-}
-
 static bool sock_map_redirect_allowed(const struct sock *sk)
 {
 	if (sk_is_tcp(sk))
-- 
cgit v1.2.3


From b8b8315e39ffaca82e79d86dde26e9144addf66b Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Nov 2021 13:47:33 -0700
Subject: bpf, sockmap: Remove unhash handler for BPF sockmap usage

We do not need to handle unhash from BPF side we can simply wait for the
close to happen. The original concern was a socket could transition from
ESTABLISHED state to a new state while the BPF hook was still attached.
But, we convinced ourself this is no longer possible and we also improved
BPF sockmap to handle listen sockets so this is no longer a problem.

More importantly though there are cases where unhash is called when data is
in the receive queue. The BPF unhash logic will flush this data which is
wrong. To be correct it should keep the data in the receive queue and allow
a receiving application to continue reading the data. This may happen when
tcp_abort() is received for example. Instead of complicating the logic in
unhash simply moving all this to tcp_close() hook solves this.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Jussi Maki <joamaki@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-3-john.fastabend@gmail.com
---
 net/ipv4/tcp_bpf.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 5f4d6f45d87f..246f725b78c9 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -475,7 +475,6 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 				   struct proto *base)
 {
 	prot[TCP_BPF_BASE]			= *base;
-	prot[TCP_BPF_BASE].unhash		= sock_map_unhash;
 	prot[TCP_BPF_BASE].close		= sock_map_close;
 	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
 	prot[TCP_BPF_BASE].sock_is_readable	= sk_msg_is_readable;
-- 
cgit v1.2.3


From c5d2177a72a1659554922728fc407f59950aa929 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Nov 2021 13:47:34 -0700
Subject: bpf, sockmap: Fix race in ingress receive verdict with redirect to
 self

A socket in a sockmap may have different combinations of programs attached
depending on configuration. There can be no programs in which case the socket
acts as a sink only. There can be a TX program in this case a BPF program is
attached to sending side, but no RX program is attached. There can be an RX
program only where sends have no BPF program attached, but receives are hooked
with BPF. And finally, both TX and RX programs may be attached. Giving us the
permutations:

 None, Tx, Rx, and TxRx

To date most of our use cases have been TX case being used as a fast datapath
to directly copy between local application and a userspace proxy. Or Rx cases
and TxRX applications that are operating an in kernel based proxy. The traffic
in the first case where we hook applications into a userspace application looks
like this:

  AppA  redirect   AppB
   Tx <-----------> Rx
   |                |
   +                +
   TCP <--> lo <--> TCP

In this case all traffic from AppA (after 3whs) is copied into the AppB
ingress queue and no traffic is ever on the TCP recieive_queue.

In the second case the application never receives, except in some rare error
cases, traffic on the actual user space socket. Instead the send happens in
the kernel.

           AppProxy       socket pool
       sk0 ------------->{sk1,sk2, skn}
        ^                      |
        |                      |
        |                      v
       ingress              lb egress
       TCP                  TCP

Here because traffic is never read off the socket with userspace recv() APIs
there is only ever one reader on the sk receive_queue. Namely the BPF programs.

However, we've started to introduce a third configuration where the BPF program
on receive should process the data, but then the normal case is to push the
data into the receive queue of AppB.

       AppB
       recv()                (userspace)
     -----------------------
       tcp_bpf_recvmsg()     (kernel)
         |             |
         |             |
         |             |
       ingress_msgQ    |
         |             |
       RX_BPF          |
         |             |
         v             v
       sk->receive_queue

This is different from the App{A,B} redirect because traffic is first received
on the sk->receive_queue.

Now for the issue. The tcp_bpf_recvmsg() handler first checks the ingress_msg
queue for any data handled by the BPF rx program and returned with PASS code
so that it was enqueued on the ingress msg queue. Then if no data exists on
that queue it checks the socket receive queue. Unfortunately, this is the same
receive_queue the BPF program is reading data off of. So we get a race. Its
possible for the recvmsg() hook to pull data off the receive_queue before the
BPF hook has a chance to read it. It typically happens when an application is
banging on recv() and getting EAGAINs. Until they manage to race with the RX
BPF program.

To fix this we note that before this patch at attach time when the socket is
loaded into the map we check if it needs a TX program or just the base set of
proto bpf hooks. Then it uses the above general RX hook regardless of if we
have a BPF program attached at rx or not. This patch now extends this check to
handle all cases enumerated above, TX, RX, TXRX, and none. And to fix above
race when an RX program is attached we use a new hook that is nearly identical
to the old one except now we do not let the recv() call skip the RX BPF program.
Now only the BPF program pulls data from sk->receive_queue and recv() only
pulls data from the ingress msgQ post BPF program handling.

With this resolved our AppB from above has been up and running for many hours
without detecting any errors. We do this by correlating counters in RX BPF
events and the AppB to ensure data is never skipping the BPF program. Selftests,
was not able to detect this because we only run them for a short period of time
on well ordered send/recvs so we don't get any of the noise we see in real
application environments.

Fixes: 51199405f9672 ("bpf: skb_verdict, support SK_PASS on RX BPF path")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Jussi Maki <joamaki@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-4-john.fastabend@gmail.com
---
 net/ipv4/tcp_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 246f725b78c9..f70aa0932bd6 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -172,6 +172,41 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
 	return ret;
 }
 
+static int tcp_bpf_recvmsg_parser(struct sock *sk,
+				  struct msghdr *msg,
+				  size_t len,
+				  int nonblock,
+				  int flags,
+				  int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+	lock_sock(sk);
+msg_bytes_ready:
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		long timeo;
+		int data;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = tcp_msg_wait_data(sk, psock, timeo);
+		if (data && !sk_psock_queue_empty(psock))
+			goto msg_bytes_ready;
+		copied = -EAGAIN;
+	}
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied;
+}
+
 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len)
 {
@@ -464,6 +499,8 @@ enum {
 enum {
 	TCP_BPF_BASE,
 	TCP_BPF_TX,
+	TCP_BPF_RX,
+	TCP_BPF_TXRX,
 	TCP_BPF_NUM_CFGS,
 };
 
@@ -482,6 +519,12 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
 	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
 	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
 	prot[TCP_BPF_TX].sendpage		= tcp_bpf_sendpage;
+
+	prot[TCP_BPF_RX]			= prot[TCP_BPF_BASE];
+	prot[TCP_BPF_RX].recvmsg		= tcp_bpf_recvmsg_parser;
+
+	prot[TCP_BPF_TXRX]			= prot[TCP_BPF_TX];
+	prot[TCP_BPF_TXRX].recvmsg		= tcp_bpf_recvmsg_parser;
 }
 
 static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
@@ -519,6 +562,10 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
 	int config = psock->progs.msg_parser   ? TCP_BPF_TX   : TCP_BPF_BASE;
 
+	if (psock->progs.stream_verdict || psock->progs.skb_verdict) {
+		config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;
+	}
+
 	if (restore) {
 		if (inet_csk_has_ulp(sk)) {
 			/* TLS does not have an unhash proto in SW cases,
-- 
cgit v1.2.3


From e0dc3b93bd7bcff8c3813d1df43e0908499c7cf0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 3 Nov 2021 13:47:35 -0700
Subject: bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and
 colliding

Strparser is reusing the qdisc_skb_cb struct to stash the skb message handling
progress, e.g. offset and length of the skb. First this is poorly named and
inherits a struct from qdisc that doesn't reflect the actual usage of cb[] at
this layer.

But, more importantly strparser is using the following to access its metadata.

  (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data))

Where _strp_msg is defined as:

  struct _strp_msg {
        struct strp_msg            strp;                 /*     0     8 */
        int                        accum_len;            /*     8     4 */

        /* size: 12, cachelines: 1, members: 2 */
        /* last cacheline: 12 bytes */
  };

So we use 12 bytes of ->data[] in struct. However in BPF code running parser
and verdict the user has read capabilities into the data[] array as well. Its
not too problematic, but we should not be exposing internal state to BPF
program. If its really needed then we can use the probe_read() APIs which allow
reading kernel memory. And I don't believe cb[] layer poses any API breakage by
moving this around because programs can't depend on cb[] across layers.

In order to fix another issue with a ctx rewrite we need to stash a temp
variable somewhere. To make this work cleanly this patch builds a cb struct
for sk_skb types called sk_skb_cb struct. Then we can use this consistently
in the strparser, sockmap space. Additionally we can start allowing ->cb[]
write access after this.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Jussi Maki <joamaki@gmail.com>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-5-john.fastabend@gmail.com
---
 net/core/filter.c         | 22 ++++++++++++++++++++++
 net/strparser/strparser.c | 10 +---------
 2 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index a68418268e92..c3936d0724b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9782,11 +9782,33 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
 	switch (si->off) {
 	case offsetof(struct __sk_buff, data_end):
 		insn = bpf_convert_data_end_access(si, insn);
 		break;
+	case offsetof(struct __sk_buff, cb[0]) ...
+	     offsetofend(struct __sk_buff, cb[4]) - 1:
+		BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
+		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+			      offsetof(struct sk_skb_cb, data)) %
+			     sizeof(__u64));
+
+		prog->cb_access = 1;
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, cb[0]);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct sk_skb_cb, data);
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+					      si->src_reg, off);
+		else
+			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+					      si->src_reg, off);
+		break;
+
+
 	default:
 		return bpf_convert_ctx_access(type, si, insn_buf, prog,
 					      target_size);
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 9c0343568d2a..1a72c67afed5 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -27,18 +27,10 @@
 
 static struct workqueue_struct *strp_wq;
 
-struct _strp_msg {
-	/* Internal cb structure. struct strp_msg must be first for passing
-	 * to upper layer.
-	 */
-	struct strp_msg strp;
-	int accum_len;
-};
-
 static inline struct _strp_msg *_strp_msg(struct sk_buff *skb)
 {
 	return (struct _strp_msg *)((void *)skb->cb +
-		offsetof(struct qdisc_skb_cb, data));
+		offsetof(struct sk_skb_cb, strp));
 }
 
 /* Lower lock held */
-- 
cgit v1.2.3


From b2c4618162ec615a15883a804cce7e27afecfa58 Mon Sep 17 00:00:00 2001
From: Jussi Maki <joamaki@gmail.com>
Date: Wed, 3 Nov 2021 13:47:36 -0700
Subject: bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg

The current conversion of skb->data_end reads like this:

  ; data_end = (void*)(long)skb->data_end;
   559: (79) r1 = *(u64 *)(r2 +200)   ; r1  = skb->data
   560: (61) r11 = *(u32 *)(r2 +112)  ; r11 = skb->len
   561: (0f) r1 += r11
   562: (61) r11 = *(u32 *)(r2 +116)
   563: (1f) r1 -= r11

But similar to the case in 84f44df664e9 ("bpf: sock_ops sk access may stomp
registers when dst_reg = src_reg"), the code will read an incorrect skb->len
when src == dst. In this case we end up generating this xlated code:

  ; data_end = (void*)(long)skb->data_end;
   559: (79) r1 = *(u64 *)(r1 +200)   ; r1  = skb->data
   560: (61) r11 = *(u32 *)(r1 +112)  ; r11 = (skb->data)->len
   561: (0f) r1 += r11
   562: (61) r11 = *(u32 *)(r1 +116)
   563: (1f) r1 -= r11

... where line 560 is the reading 4B of (skb->data + 112) instead of the
intended skb->len Here the skb pointer in r1 gets set to skb->data and the
later deref for skb->len ends up following skb->data instead of skb.

This fixes the issue similarly to the patch mentioned above by creating an
additional temporary variable and using to store the register when dst_reg =
src_reg. We name the variable bpf_temp_reg and place it in the cb context for
sk_skb. Then we restore from the temp to ensure nothing is lost.

Fixes: 16137b09a66f2 ("bpf: Compute data_end dynamically with JIT code")
Signed-off-by: Jussi Maki <joamaki@gmail.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20211103204736.248403-6-john.fastabend@gmail.com
---
 net/core/filter.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index c3936d0724b8..e471c9b09670 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9756,22 +9756,46 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
 						    struct bpf_insn *insn)
 {
-	/* si->dst_reg = skb->data */
+	int reg;
+	int temp_reg_off = offsetof(struct sk_buff, cb) +
+			   offsetof(struct sk_skb_cb, temp_reg);
+
+	if (si->src_reg == si->dst_reg) {
+		/* We need an extra register, choose and save a register. */
+		reg = BPF_REG_9;
+		if (si->src_reg == reg || si->dst_reg == reg)
+			reg--;
+		if (si->src_reg == reg || si->dst_reg == reg)
+			reg--;
+		*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
+	} else {
+		reg = si->dst_reg;
+	}
+
+	/* reg = skb->data */
 	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
-			      si->dst_reg, si->src_reg,
+			      reg, si->src_reg,
 			      offsetof(struct sk_buff, data));
 	/* AX = skb->len */
 	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
 			      BPF_REG_AX, si->src_reg,
 			      offsetof(struct sk_buff, len));
-	/* si->dst_reg = skb->data + skb->len */
-	*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+	/* reg = skb->data + skb->len */
+	*insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
 	/* AX = skb->data_len */
 	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
 			      BPF_REG_AX, si->src_reg,
 			      offsetof(struct sk_buff, data_len));
-	/* si->dst_reg = skb->data + skb->len - skb->data_len */
-	*insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
+
+	/* reg = skb->data + skb->len - skb->data_len */
+	*insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
+
+	if (si->src_reg == si->dst_reg) {
+		/* Restore the saved register */
+		*insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
+		*insn++ = BPF_MOV64_REG(si->dst_reg, reg);
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
+	}
 
 	return insn;
 }
-- 
cgit v1.2.3