summaryrefslogtreecommitdiffstats
path: root/include/linux/bpf-cgroup.h
diff options
context:
space:
mode:
authorRoman Gushchin <guro@fb.com>2019-05-25 09:37:39 -0700
committerAlexei Starovoitov <ast@kernel.org>2019-05-28 09:30:02 -0700
commit4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a (patch)
treef392f37509246a1155c20d9f1c69857d0241e08a /include/linux/bpf-cgroup.h
parent37b54aed123faa19eb21d7ef2534756c5a152a7c (diff)
downloadlinux-4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a.tar.gz
linux-4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a.tar.bz2
linux-4bfc0bb2c60e2f4cc8eb60f03cf8dfa72336272a.zip
bpf: decouple the lifetime of cgroup_bpf from cgroup itself
Currently the lifetime of bpf programs attached to a cgroup is bound to the lifetime of the cgroup itself. It means that if a user forgets (or intentionally avoids) to detach a bpf program before removing the cgroup, it will stay attached up to the release of the cgroup. Since the cgroup can stay in the dying state (the state between being rmdir()'ed and being released) for a very long time, it leads to a waste of memory. Also, it blocks a possibility to implement the memcg-based memory accounting for bpf objects, because a circular reference dependency will occur. Charged memory pages are pinning the corresponding memory cgroup, and if the memory cgroup is pinning the attached bpf program, nothing will be ever released. A dying cgroup can not contain any processes, so the only chance for an attached bpf program to be executed is a live socket associated with the cgroup. So in order to release all bpf data early, let's count associated sockets using a new percpu refcounter. On cgroup removal the counter is transitioned to the atomic mode, and as soon as it reaches 0, all bpf programs are detached. Because cgroup_bpf_release() can block, it can't be called from the percpu ref counter callback directly, so instead an asynchronous work is scheduled. The reference counter is not socket specific, and can be used for any other types of programs, which can be executed from a cgroup-bpf hook outside of the process context, had such a need arise in the future. Signed-off-by: Roman Gushchin <guro@fb.com> Cc: jolsa@redhat.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include/linux/bpf-cgroup.h')
-rw-r--r--include/linux/bpf-cgroup.h11
1 files changed, 9 insertions, 2 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..9f100fc422c3 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
+#include <linux/percpu-refcount.h>
#include <linux/rbtree.h>
#include <uapi/linux/bpf.h>
@@ -72,10 +73,16 @@ struct cgroup_bpf {
/* temp storage for effective prog array used by prog_attach/detach */
struct bpf_prog_array __rcu *inactive;
+
+ /* reference counter used to detach bpf programs after cgroup removal */
+ struct percpu_ref refcnt;
+
+ /* cgroup_bpf is released using a work queue */
+ struct work_struct release_work;
};
-void cgroup_bpf_put(struct cgroup *cgrp);
int cgroup_bpf_inherit(struct cgroup *cgrp);
+void cgroup_bpf_offline(struct cgroup *cgrp);
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
struct bpf_prog;
struct cgroup_bpf {};
-static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
+static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype,