From 93dd5f185916b05e931cffae636596f21f98546e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 25 Jun 2020 16:12:59 -0700 Subject: bpf, sockmap: RCU splat with redirect and strparser error or TLS There are two paths to generate the below RCU splat the first and most obvious is the result of the BPF verdict program issuing a redirect on a TLS socket (This is the splat shown below). Unlike the non-TLS case the caller of the *strp_read() hooks does not wrap the call in a rcu_read_lock/unlock. Then if the BPF program issues a redirect action we hit the RCU splat. However, in the non-TLS socket case the splat appears to be relatively rare, because the skmsg caller into the strp_data_ready() is wrapped in a rcu_read_lock/unlock. Shown here, static void sk_psock_strp_data_ready(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { psock->parser.saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->parser.strp); write_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } If the above was the only way to run the verdict program we would be safe. But, there is a case where the strparser may throw an ENOMEM error while parsing the skb. This is a result of a failed skb_clone, or alloc_skb_for_msg while building a new merged skb when the msg length needed spans multiple skbs. This will in turn put the skb on the strp_wrk workqueue in the strparser code. The skb will later be dequeued and verdict programs run, but now from a different context without the rcu_read_lock()/unlock() critical section in sk_psock_strp_data_ready() shown above. In practice I have not seen this yet, because as far as I know most users of the verdict programs are also only working on single skbs. In this case no merge happens which could trigger the above ENOMEM errors. In addition the system would need to be under memory pressure. For example, we can't hit the above case in selftests because we missed having tests to merge skbs. (Added in later patch) To fix the below splat extend the rcu_read_lock/unnlock block to include the call to sk_psock_tls_verdict_apply(). This will fix both TLS redirect case and non-TLS redirect+error case. Also remove psock from the sk_psock_tls_verdict_apply() function signature its not used there. [ 1095.937597] WARNING: suspicious RCU usage [ 1095.940964] 5.7.0-rc7-02911-g463bac5f1ca79 #1 Tainted: G W [ 1095.944363] ----------------------------- [ 1095.947384] include/linux/skmsg.h:284 suspicious rcu_dereference_check() usage! [ 1095.950866] [ 1095.950866] other info that might help us debug this: [ 1095.950866] [ 1095.957146] [ 1095.957146] rcu_scheduler_active = 2, debug_locks = 1 [ 1095.961482] 1 lock held by test_sockmap/15970: [ 1095.964501] #0: ffff9ea6b25de660 (sk_lock-AF_INET){+.+.}-{0:0}, at: tls_sw_recvmsg+0x13a/0x840 [tls] [ 1095.968568] [ 1095.968568] stack backtrace: [ 1095.975001] CPU: 1 PID: 15970 Comm: test_sockmap Tainted: G W 5.7.0-rc7-02911-g463bac5f1ca79 #1 [ 1095.977883] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 [ 1095.980519] Call Trace: [ 1095.982191] dump_stack+0x8f/0xd0 [ 1095.984040] sk_psock_skb_redirect+0xa6/0xf0 [ 1095.986073] sk_psock_tls_strp_read+0x1d8/0x250 [ 1095.988095] tls_sw_recvmsg+0x714/0x840 [tls] v2: Improve commit message to identify non-TLS redirect plus error case condition as well as more common TLS case. In the process I decided doing the rcu_read_unlock followed by the lock/unlock inside branches was unnecessarily complex. We can just extend the current rcu block and get the same effeective without the shuffling and branching. Thanks Martin! Fixes: e91de6afa81c1 ("bpf: Fix running sk_skb program types with ktls") Reported-by: Jakub Sitnicki Reported-by: kernel test robot Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/159312677907.18340.11064813152758406626.stgit@john-XPS-13-9370 --- net/core/skmsg.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 351afbf6bfba..c41ab6906b21 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -683,7 +683,7 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) +static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -715,12 +715,11 @@ static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } -static void sk_psock_tls_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(psock, skb); + sk_psock_skb_redirect(skb); break; case __SK_PASS: case __SK_DROP: @@ -741,8 +740,8 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } + sk_psock_tls_verdict_apply(skb, ret); rcu_read_unlock(); - sk_psock_tls_verdict_apply(psock, skb, ret); return ret; } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); @@ -770,7 +769,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_psock_skb_redirect(psock, skb); + sk_psock_skb_redirect(skb); break; case __SK_DROP: /* fall-through */ @@ -794,8 +793,8 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) ret = sk_psock_bpf_run(psock, prog, skb); ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } - rcu_read_unlock(); sk_psock_verdict_apply(psock, skb, ret); + rcu_read_unlock(); } static int sk_psock_strp_read_done(struct strparser *strp, int err) -- cgit v1.2.3 From 8025751d4d55a2f32be6bdf825b6a80c299875f5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 25 Jun 2020 16:13:18 -0700 Subject: bpf, sockmap: RCU dereferenced psock may be used outside RCU block If an ingress verdict program specifies message sizes greater than skb->len and there is an ENOMEM error due to memory pressure we may call the rcv_msg handler outside the strp_data_ready() caller context. This is because on an ENOMEM error the strparser will retry from a workqueue. The caller currently protects the use of psock by calling the strp_data_ready() inside a rcu_read_lock/unlock block. But, in above workqueue error case the psock is accessed outside the read_lock/unlock block of the caller. So instead of using psock directly we must do a look up against the sk again to ensure the psock is available. There is an an ugly piece here where we must handle the case where we paused the strp and removed the psock. On psock removal we first pause the strparser and then remove the psock. If the strparser is paused while an skb is scheduled on the workqueue the skb will be dropped on the flow and kfree_skb() is called. If the workqueue manages to get called before we pause the strparser but runs the rcvmsg callback after the psock is removed we will hit the unlikely case where we run the sockmap rcvmsg handler but do not have a psock. For now we will follow strparser logic and drop the skb on the floor with skb_kfree(). This is ugly because the data is dropped. To date this has not caused problems in practice because either the application controlling the sockmap is coordinating with the datapath so that skbs are "flushed" before removal or we simply wait for the sock to be closed before removing it. This patch fixes the describe RCU bug and dropping the skb doesn't make things worse. Future patches will improve this by allowing the normal case where skbs are not merged to skip the strparser altogether. In practice many (most?) use cases have no need to merge skbs so its both a code complexity hit as seen above and a performance issue. For example, in the Cilium case we always set the strparser up to return sbks 1:1 without any merging and have avoided above issues. Fixes: e91de6afa81c1 ("bpf: Fix running sk_skb program types with ktls") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/159312679888.18340.15248924071966273998.stgit@john-XPS-13-9370 --- net/core/skmsg.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c41ab6906b21..6a32a1fd34f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -781,11 +781,18 @@ out_free: static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; + struct sock *sk; rcu_read_lock(); + sk = strp->sk; + psock = sk_psock(sk); + if (unlikely(!psock)) { + kfree_skb(skb); + goto out; + } prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_orphan(skb); @@ -794,6 +801,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); +out: rcu_read_unlock(); } -- cgit v1.2.3 From e8280338c778a3f81477624267c9fa47f931477b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 26 Jun 2020 11:25:27 -0700 Subject: net: explain the lockdep annotations for dev_uc_unsync() The lockdep annotations for dev_uc_unsync() and dev_mc_unsync() are not easy to understand, so add some comments to explain why they are correct. Similar for the rest netif_addr_lock_bh() cases, they don't need nested version. Cc: Taehee Yoo Cc: Dmitry Vyukov Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 6393ba930097..54cd568e7c2f 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -690,6 +690,15 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two + * reasons: + * 1) This is always called without any addr_list_lock, so as the + * outermost one here, it must be 0. + * 2) This is called by some callers after unlinking the upper device, + * so the dev->lower_level becomes 1 again. + * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or + * larger. + */ netif_addr_lock_bh(from); netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); @@ -911,6 +920,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return; + /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); -- cgit v1.2.3 From 3b7016996c4c44db5d499d98759b82fb714bb912 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:54 +0200 Subject: flow_dissector: Pull BPF program assignment up to bpf-netns Prepare for using bpf_prog_array to store attached programs by moving out code that updates the attached program out of flow dissector. Managing bpf_prog_array is more involved than updating a single bpf_prog pointer. This will let us do it all from one place, bpf/net_namespace.c, in the subsequent patch. No functional change intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d02df0b6d0d9..b57fb1359395 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -70,10 +70,10 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; - struct bpf_prog *attached; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -97,15 +97,6 @@ int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) return -EEXIST; } - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (attached == prog) - /* The same program cannot be attached twice */ - return -EINVAL; - - rcu_assign_pointer(net->bpf.progs[type], prog); - if (attached) - bpf_prog_put(attached); return 0; } #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 695c12147a40181fe9221d321c3f2de33c9574ed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:55 +0200 Subject: bpf, netns: Keep attached programs in bpf_prog_array Prepare for having multi-prog attachments for new netns attach types by storing programs to run in a bpf_prog_array, which is well suited for iterating over programs and running them in sequence. After this change bpf(PROG_QUERY) may block to allocate memory in bpf_prog_array_copy_to_user() for collected program IDs. This forces a change in how we protect access to the attached program in the query callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from an RCU read lock to holding a mutex that serializes updaters. Because we allow only one BPF flow_dissector program to be attached to netns at all times, the bpf_prog_array pointed by net->bpf.run_array is always either detached (null) or one element long. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com --- net/core/flow_dissector.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b57fb1359395..142a8824f0a8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -86,14 +86,14 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->bpf.progs[type])) + if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.bpf.progs[type])) + if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } @@ -894,7 +894,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; - struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; @@ -951,14 +950,14 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; rcu_read_lock(); - attached = rcu_dereference(init_net.bpf.progs[type]); + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); - if (!attached) - attached = rcu_dereference(net->bpf.progs[type]); - - if (attached) { + if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, @@ -966,6 +965,7 @@ bool __skb_flow_dissect(const struct net *net, .data_end = data + hlen, }; __be16 n_proto = proto; + struct bpf_prog *prog; if (skb) { ctx.skb = skb; @@ -976,7 +976,8 @@ bool __skb_flow_dissect(const struct net *net, n_proto = skb->protocol; } - ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, + prog = READ_ONCE(run_array->items[0].prog); + ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); -- cgit v1.2.3 From 9b2b09717e1812e450782a43ca0c2790651cf380 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:27 +0100 Subject: bpf: sockmap: Check value of unused args to BPF_PROG_ATTACH Using BPF_PROG_ATTACH on a sockmap program currently understands no flags or replace_bpf_fd, but accepts any value. Return EINVAL instead. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-4-lmb@cloudflare.com --- net/core/sock_map.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4059f94e9bb5..58016a5c63ff 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -70,6 +70,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) struct fd f; int ret; + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- net/core/sock_map.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 58016a5c63ff..0971f17e8e54 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -77,7 +77,42 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: fdput(f); return ret; } @@ -1206,27 +1241,32 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) } int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); + pprog = &progs->msg_parser; break; case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); + pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); + pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); return 0; } -- cgit v1.2.3 From d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/core/filter.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..82e1b5b06167 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5853,12 +5853,16 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; - if (skb->protocol == cpu_to_be16(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); - else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + break; + case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); - else + break; + default: return 0; + } if (skb_headlen(skb) < iphdr_len) return 0; -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index d832c650287c..2e5b7870e5d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1926,7 +1926,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; - cgroup_sk_alloc(&newsk->sk_cgrp_data); + cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); -- cgit v1.2.3