From 49ca6153208f6efc409c1deb82dd5bcbb519d7e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 24 Aug 2021 09:39:31 +0200 Subject: bpf: Relicense disassembler as GPL-2.0-only OR BSD-2-Clause Some time ago we dual-licensed both libbpf and bpftool through commits 1bc38b8ff6cc ("libbpf: relicense libbpf as LGPL-2.1 OR BSD-2-Clause") and 907b22365115 ("tools: bpftool: dual license all files"). The latter missed the disasm.{c,h} which we pull in via kernel/bpf/ such that we have a single source for verifier as well as bpftool asm dumping, see also f4ac7e0b5cc8 ("bpf: move instruction printing into a separate file"). It is currently GPL-2.0-only and missed the conversion in 907b22365115, therefore relicense the two as GPL-2.0-only OR BSD-2-Clause as well. Spotted-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Brendan Jackman Acked-by: Jakub Kicinski Acked-by: Jiri Olsa Acked-by: Simon Horman Acked-by: Martin KaFai Lau Acked-by: Xu Kuohai Acked-by: Edward Cree --- kernel/bpf/disasm.c | 2 +- kernel/bpf/disasm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ -- cgit v1.2.3 From 2f1aaf3ea666b737ad717b3d88667225aca23149 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 9 Sep 2021 08:49:59 -0700 Subject: bpf, mm: Fix lockdep warning triggered by stack_map_get_build_id_offset() Currently the bpf selftest "get_stack_raw_tp" triggered the warning: [ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260 To reproduce, first build `test_progs` binary: make -C tools/testing/selftests/bpf -j60 and then run the binary at tools/testing/selftests/bpf directory: ./test_progs -t get_stack_raw_tp The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run. This patch fixed the issue by (1). using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy lockdep checking in find_vma(), and (2). droping lockdep for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since after this patch nobody calls it any more. Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Reviewed-by: Liam R. Howlett Cc: Luigi Rizzo Cc: Jason Gunthorpe Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com --- kernel/bpf/stackmap.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } -- cgit v1.2.3 From 0e6491b559704da720f6da09dd0a52c4df44c514 Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Sat, 11 Sep 2021 08:55:57 +0800 Subject: bpf: Add oversize check before call kvcalloc() Commit 7661809d493b ("mm: don't allow oversized kvmalloc() calls") add the oversize check. When the allocation is larger than what kmalloc() supports, the following warning triggered: WARNING: CPU: 0 PID: 8408 at mm/util.c:597 kvmalloc_node+0x108/0x110 mm/util.c:597 Modules linked in: CPU: 0 PID: 8408 Comm: syz-executor221 Not tainted 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:kvmalloc_node+0x108/0x110 mm/util.c:597 Call Trace: kvmalloc include/linux/mm.h:806 [inline] kvmalloc_array include/linux/mm.h:824 [inline] kvcalloc include/linux/mm.h:829 [inline] check_btf_line kernel/bpf/verifier.c:9925 [inline] check_btf_info kernel/bpf/verifier.c:10049 [inline] bpf_check+0xd634/0x150d0 kernel/bpf/verifier.c:13759 bpf_prog_load kernel/bpf/syscall.c:2301 [inline] __sys_bpf+0x11181/0x126e0 kernel/bpf/syscall.c:4587 __do_sys_bpf kernel/bpf/syscall.c:4691 [inline] __se_sys_bpf kernel/bpf/syscall.c:4689 [inline] __x64_sys_bpf+0x78/0x90 kernel/bpf/syscall.c:4689 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+f3e749d4c662818ae439@syzkaller.appspotmail.com Signed-off-by: Bixuan Cui Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210911005557.45518-1-cuibixuan@huawei.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..e76b55917905 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || -- cgit v1.2.3 From 8520e224f547cd070c7c8f97b1fc6d58cff7ccaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Sep 2021 01:07:57 +0200 Subject: bpf, cgroups: Fix cgroup v2 fallback on v1/v2 mixed mode Fix cgroup v1 interference when non-root cgroup v2 BPF programs are used. Back in the days, commit bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") embedded per-socket cgroup information into sock->sk_cgrp_data and in order to save 8 bytes in struct sock made both mutually exclusive, that is, when cgroup v1 socket tagging (e.g. net_cls/net_prio) is used, then cgroup v2 falls back to the root cgroup in sock_cgroup_ptr() (&cgrp_dfl_root.cgrp). The assumption made was "there is no reason to mix the two and this is in line with how legacy and v2 compatibility is handled" as stated in bd1060a1d671. However, with Kubernetes more widely supporting cgroups v2 as well nowadays, this assumption no longer holds, and the possibility of the v1/v2 mixed mode with the v2 root fallback being hit becomes a real security issue. Many of the cgroup v2 BPF programs are also used for policy enforcement, just to pick _one_ example, that is, to programmatically deny socket related system calls like connect(2) or bind(2). A v2 root fallback would implicitly cause a policy bypass for the affected Pods. In production environments, we have recently seen this case due to various circumstances: i) a different 3rd party agent and/or ii) a container runtime such as [0] in the user's environment configuring legacy cgroup v1 net_cls tags, which triggered implicitly mentioned root fallback. Another case is Kubernetes projects like kind [1] which create Kubernetes nodes in a container and also add cgroup namespaces to the mix, meaning programs which are attached to the cgroup v2 root of the cgroup namespace get attached to a non-root cgroup v2 path from init namespace point of view. And the latter's root is out of reach for agents on a kind Kubernetes node to configure. Meaning, any entity on the node setting cgroup v1 net_cls tag will trigger the bypass despite cgroup v2 BPF programs attached to the namespace root. Generally, this mutual exclusiveness does not hold anymore in today's user environments and makes cgroup v2 usage from BPF side fragile and unreliable. This fix adds proper struct cgroup pointer for the cgroup v2 case to struct sock_cgroup_data in order to address these issues; this implicitly also fixes the tradeoffs being made back then with regards to races and refcount leaks as stated in bd1060a1d671, and removes the fallback, so that cgroup v2 BPF programs always operate as expected. [0] https://github.com/nestybox/sysbox/ [1] https://kind.sigs.k8s.io/ Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Stanislav Fomichev Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20210913230759.2313-1-daniel@iogearbox.net --- kernel/cgroup/cgroup.c | 50 ++++++++++---------------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..8afa8690d288 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,44 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); - while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; + skcd->cgroup = cset->dfl_cgrp; cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); } - rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } -- cgit v1.2.3