From e06fa9c16ce4b740996189fa5610eabcee734e6c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:50 +0200 Subject: bpf, sockmap: fix potential use after free in bpf_tcp_close bpf_tcp_close() we pop the psock linkage to a map via psock_map_pop(). A parallel update on the sock hash map can happen between psock_map_pop() and lookup_elem_raw() where we override the element under link->hash / link->key. In bpf_tcp_close()'s lookup_elem_raw() we subsequently only test whether an element is present, but we do not test whether the element is infact the element we were looking for. We lock the sock in bpf_tcp_close() during that time, so do we hold the lock in sock_hash_update_elem(). However, the latter locks the sock which is newly updated, not the one we're purging from the hash table. This means that while one CPU is doing the lookup from bpf_tcp_close(), another CPU is doing the map update in parallel, dropped our sock from the hlist and released the psock. Subsequently the first CPU will find the new sock and attempts to drop and release the old sock yet another time. Fix is that we need to check the elements for a match after lookup, similar as we do in the sock map. Note that the hash tab elems are freed via RCU, so access to their link->hash / link->key is fine since we're under RCU read side there. Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf5195c7c331..01879e4d599a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -369,7 +369,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) /* If another thread deleted this object skip deletion. * The refcnt on psock may or may not be zero. */ - if (l) { + if (l && l == link) { hlist_del_rcu(&link->hash_node); smap_release_sock(psock, link->sk); free_htab_elem(htab, link); -- cgit v1.2.3 From 15c480efab01197c965ce0562a43ffedd852b8f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 Aug 2018 22:08:51 +0200 Subject: bpf, sockmap: fix psock refcount leak in bpf_tcp_recvmsg In bpf_tcp_recvmsg() we first took a reference on the psock, however once we find that there are skbs in the normal socket's receive queue we return with processing them through tcp_recvmsg(). Problem is that we leak the taken reference on the psock in that path. Given we don't really do anything with the psock at this point, move the skb_queue_empty() test before we fetch the psock to fix this case. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 01879e4d599a..26d8a3053407 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -912,6 +912,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); rcu_read_lock(); psock = smap_psock_sk(sk); @@ -922,9 +924,6 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out; rcu_read_unlock(); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); bytes_ready: while (copied != len) { -- cgit v1.2.3 From 501ca81760c204ec59b73e4a00bee5971fc0f1b1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 24 Aug 2018 17:37:00 -0700 Subject: bpf: sockmap, decrement copied count correctly in redirect error case Currently, when a redirect occurs in sockmap and an error occurs in the redirect call we unwind the scatterlist once in the error path of bpf_tcp_sendmsg_do_redirect() and then again in sendmsg(). Then in the error path of sendmsg we decrement the copied count by the send size. However, its possible we partially sent data before the error was generated. This can happen if do_tcp_sendpages() partially sends the scatterlist before encountering a memory pressure error. If this happens we need to decrement the copied value (the value tracking how many bytes were actually sent to TCP stack) by the number of remaining bytes _not_ the entire send size. Otherwise we risk confusing userspace. Also we don't need two calls to free the scatterlist one is good enough. So remove the one in bpf_tcp_sendmsg_do_redirect() and then properly reduce copied by the number of remaining bytes which may in fact be the entire send size if no bytes were sent. To do this use bool to indicate if free_start_sg() should do mem accounting or not. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 26d8a3053407..ce63e5801746 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -236,7 +236,7 @@ static int bpf_tcp_init(struct sock *sk) } static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); static void bpf_tcp_release(struct sock *sk) { @@ -248,7 +248,7 @@ static void bpf_tcp_release(struct sock *sk) goto out; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } @@ -330,14 +330,14 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); psock->cork = NULL; } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } @@ -570,14 +570,16 @@ static void free_bytes_sg(struct sock *sk, int bytes, md->sg_start = i; } -static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +static int free_sg(struct sock *sk, int start, + struct sk_msg_buff *md, bool charge) { struct scatterlist *sg = md->sg_data; int i = start, free = 0; while (sg[i].length) { free += sg[i].length; - sk_mem_uncharge(sk, sg[i].length); + if (charge) + sk_mem_uncharge(sk, sg[i].length); if (!md->skb) put_page(sg_page(&sg[i])); sg[i].length = 0; @@ -594,9 +596,9 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) return free; } -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) { - int free = free_sg(sk, md->sg_start, md); + int free = free_sg(sk, md->sg_start, md, charge); md->sg_start = md->sg_end; return free; @@ -604,7 +606,7 @@ static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) { - return free_sg(sk, md->sg_curr, md); + return free_sg(sk, md->sg_curr, md, true); } static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) @@ -718,7 +720,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, list_add_tail(&r->list, &psock->ingress); sk->sk_data_ready(sk); } else { - free_start_sg(sk, r); + free_start_sg(sk, r, true); kfree(r); } @@ -752,14 +754,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, release_sock(sk); } smap_release_sock(psock, sk); - if (unlikely(err)) - goto out; - return 0; + return err; out_rcu: rcu_read_unlock(); -out: - free_bytes_sg(NULL, send, md, false); - return err; + return 0; } static inline void bpf_md_init(struct smap_psock *psock) @@ -822,7 +820,7 @@ more_data: case __SK_PASS: err = bpf_tcp_push(sk, send, m, flags, true); if (unlikely(err)) { - *copied -= free_start_sg(sk, m); + *copied -= free_start_sg(sk, m, true); break; } @@ -845,16 +843,17 @@ more_data: lock_sock(sk); if (unlikely(err < 0)) { - free_start_sg(sk, m); + int free = free_start_sg(sk, m, false); + psock->sg_size = 0; if (!cork) - *copied -= send; + *copied -= free; } else { psock->sg_size -= send; } if (cork) { - free_start_sg(sk, m); + free_start_sg(sk, m, true); psock->sg_size = 0; kfree(m); m = NULL; @@ -1121,7 +1120,7 @@ wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (m && m != psock->cork) - free_start_sg(sk, m); + free_start_sg(sk, m, true); goto out_err; } } @@ -1580,13 +1579,13 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_tx_msg); if (psock->cork) { - free_start_sg(psock->sock, psock->cork); + free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); } list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); - free_start_sg(psock->sock, md); + free_start_sg(psock->sock, md, true); kfree(md); } -- cgit v1.2.3 From 597222f72a94118f593e4f32bf58ae7e049a0df1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 30 Aug 2018 21:25:02 -0700 Subject: bpf: avoid misuse of psock when TCP_ULP_BPF collides with another ULP Currently we check sk_user_data is non NULL to determine if the sk exists in a map. However, this is not sufficient to ensure the psock or the ULP ops are not in use by another user, such as kcm or TLS. To avoid this when adding a sock to a map also verify it is of the correct ULP type. Additionally, when releasing a psock verify that it is the TCP_ULP_BPF type before releasing the ULP. The error case where we abort an update due to ULP collision can cause this error path. For example, __sock_map_ctx_update_elem() [...] err = tcp_set_ulp_id(sock, TCP_ULP_BPF) <- collides with TLS if (err) <- so err out here goto out_free [...] out_free: smap_release_sock() <- calling tcp_cleanup_ulp releases the TLS ULP incorrectly. Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index ce63e5801746..488ef9663c01 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1462,10 +1462,16 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } +static bool psock_is_smap_sk(struct sock *sk) +{ + return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; +} + static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { - tcp_cleanup_ulp(sock); + if (psock_is_smap_sk(sock)) + tcp_cleanup_ulp(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1892,6 +1898,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, * doesn't update user data. */ if (psock) { + if (!psock_is_smap_sk(sock)) { + err = -EBUSY; + goto out_progs; + } if (READ_ONCE(psock->bpf_parse) && parse) { err = -EBUSY; goto out_progs; -- cgit v1.2.3