From 1ed1328792ff46e4bb86a3d7f7be2971f4549f6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Sep 2015 12:53:17 -0400 Subject: sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem Note: This commit was originally committed as d59cfc09c32a but got reverted by 0c986253b939 due to the performance regression from the percpu_rwsem write down/up operations added to cgroup task migration path. percpu_rwsem changes which alleviate the performance issue are pending for v4.4-rc1 merge window. Re-apply. The cgroup side of threadgroup locking uses signal_struct->group_rwsem to synchronize against threadgroup changes. This per-process rwsem adds small overhead to thread creation, exit and exec paths, forces cgroup code paths to do lock-verify-unlock-retry dance in a couple places and makes it impossible to atomically perform operations across multiple processes. This patch replaces signal_struct->group_rwsem with a global percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader side and contained in cgroups proper. This patch converts one-to-one. This does make writer side heavier and lower the granularity; however, cgroup process migration is a fairly cold path, we do want to optimize thread operations over it and cgroup migration operations don't take enough time for the lower granularity to matter. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/cgroup-defs.h | 27 +++++++++++++++++++++++++-- include/linux/init_task.h | 8 -------- include/linux/sched.h | 12 ------------ 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8492721b39be..4d8fcf2187dc 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -473,8 +473,31 @@ struct cgroup_subsys { unsigned int depends_on; }; -void cgroup_threadgroup_change_begin(struct task_struct *tsk); -void cgroup_threadgroup_change_end(struct task_struct *tsk); +extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; + +/** + * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_begin() and allows cgroup operations to + * synchronize against threadgroup changes using a percpu_rw_semaphore. + */ +static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) +{ + percpu_down_read(&cgroup_threadgroup_rwsem); +} + +/** + * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups + * @tsk: target task + * + * Called from threadgroup_change_end(). Counterpart of + * cgroup_threadcgroup_change_begin(). + */ +static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) +{ + percpu_up_read(&cgroup_threadgroup_rwsem); +} #else /* CONFIG_CGROUPS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e38681f4912d..d0b380ee7d67 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,13 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; -#ifdef CONFIG_CGROUPS -#define INIT_GROUP_RWSEM(sig) \ - .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), -#else -#define INIT_GROUP_RWSEM(sig) -#endif - #ifdef CONFIG_CPUSETS #define INIT_CPUSET_SEQ(tsk) \ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), @@ -64,7 +57,6 @@ extern struct fs_struct init_fs; INIT_PREV_CPUTIME(sig) \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ - INIT_GROUP_RWSEM(sig) \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index b7b9501b41af..a4ab9daa387c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -762,18 +762,6 @@ struct signal_struct { unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. - */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ -- cgit v1.2.3 From fa128fd735bd236b6b04d3fedfed7a784137c185 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: jump_label: make static_key_enabled() work on static_key_true/false types too static_key_enabled() can be used on struct static_key but not on its wrapper types static_key_true and static_key_false. The function is useful for debugging and management of static keys. Update it so that it can be used for the wrapper types too. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton --- include/linux/jump_label.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7f653e8f6690..c9ca050de846 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,11 +216,6 @@ static inline int jump_label_apply_nops(struct module *mod) #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled -static inline bool static_key_enabled(struct static_key *key) -{ - return static_key_count(key) > 0; -} - static inline void static_key_enable(struct static_key *key) { int count = static_key_count(key); @@ -267,6 +262,17 @@ struct static_key_false { #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT +extern bool ____wrong_branch_error(void); + +#define static_key_enabled(x) \ +({ \ + if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ + !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ + !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ + ____wrong_branch_error(); \ + static_key_count((struct static_key *)x) > 0; \ +}) + #ifdef HAVE_JUMP_LABEL /* @@ -325,8 +331,6 @@ struct static_key_false { * See jump_label_type() / jump_label_init_type(). */ -extern bool ____wrong_branch_error(void); - #define static_branch_likely(x) \ ({ \ bool branch; \ -- cgit v1.2.3 From 49d1dc4b81797f88270832b11e9f73809e7e7209 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: implement static_key based cgroup_subsys_enabled() and cgroup_subsys_on_dfl() Whether a subsys is enabled and attached to the default hierarchy seldom changes and may be tested in the hot paths. This patch implements static_key based cgroup_subsys_enabled() and cgroup_subsys_on_dfl() tests. The following patches will update the users and remove duplicate mechanisms. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- include/linux/cgroup.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index eb7ca55f72ef..c3a9f1eb4097 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -50,6 +51,26 @@ extern struct css_set init_css_set; #include #undef SUBSYS +#define SUBSYS(_x) \ + extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ + extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; +#include +#undef SUBSYS + +/** + * cgroup_subsys_enabled - fast test on whether a subsys is enabled + * @ss: subsystem in question + */ +#define cgroup_subsys_enabled(ss) \ + static_branch_likely(&ss ## _enabled_key) + +/** + * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy + * @ss: subsystem in question + */ +#define cgroup_subsys_on_dfl(ss) \ + static_branch_likely(&ss ## _on_dfl_key) + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, -- cgit v1.2.3 From fc5ed1e95410ad73b2ab8f33cd90eb3bcf6c98a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: replace cgroup_subsys->disabled tests with cgroup_subsys_enabled() Replace cgroup_subsys->disabled tests in controllers with cgroup_subsys_enabled(). cgroup_subsys_enabled() requires literal subsys name as its parameter and thus can't be used for cgroup core which iterates through controllers. For cgroup core, introduce and use cgroup_ssid_enabled() which uses slower static_key_enabled() test and can be indexed by subsys ID. This leaves cgroup_subsys->disabled unused. Removed. Signed-off-by: Tejun Heo Acked-by: Zefan Li Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup-defs.h | 1 - include/linux/hugetlb_cgroup.h | 4 +--- include/linux/memcontrol.h | 4 +--- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4d8fcf2187dc..c5d41c3c1f00 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -419,7 +419,6 @@ struct cgroup_subsys { struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int disabled; int early_init; /* diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index bcc853eccc85..7edd30515298 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -48,9 +48,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_disabled(void) { - if (hugetlb_cgrp_subsys.disabled) - return true; - return false; + return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad800e62cb7a..9aa7820c2177 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -347,9 +347,7 @@ ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_disabled(void) { - if (memory_cgrp_subsys.disabled) - return true; - return false; + return !cgroup_subsys_enabled(memory_cgrp_subsys); } /* -- cgit v1.2.3 From 9e10a130d9b62af976d17d120c95f3650769312c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 11:56:28 -0400 Subject: cgroup: replace cgroup_on_dfl() tests in controllers with cgroup_subsys_on_dfl() cgroup_on_dfl() tests whether the cgroup's root is the default hierarchy; however, an individual controller is only interested in whether the controller is attached to the default hierarchy and never tests a cgroup which doesn't belong to the hierarchy that the controller is attached to. This patch replaces cgroup_on_dfl() tests in controllers with faster static_key based cgroup_subsys_on_dfl(). This leaves cgroup core as the only user of cgroup_on_dfl() and the function is moved from the header file to cgroup.c. Signed-off-by: Tejun Heo Acked-by: Zefan Li Cc: Vivek Goyal Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup.h | 58 -------------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c3a9f1eb4097..355bf2ed8867 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -433,64 +433,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } -/** - * cgroup_on_dfl - test whether a cgroup is on the default hierarchy - * @cgrp: the cgroup of interest - * - * The default hierarchy is the v2 interface of cgroup and this function - * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the - * interface version. - * - * The set of behaviors which change on the default hierarchy are still - * being determined and the mount option is prefixed with __DEVEL__. - * - * List of changed behaviors: - * - * - Mount options "noprefix", "xattr", "clone_children", "release_agent" - * and "name" are disallowed. - * - * - When mounting an existing superblock, mount options should match. - * - * - Remount is disallowed. - * - * - rename(2) is disallowed. - * - * - "tasks" is removed. Everything should be at process granularity. Use - * "cgroup.procs" instead. - * - * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. - * - * - "release_agent" and "notify_on_release" are removed. Replacement - * notification mechanism will be implemented. - * - * - "cgroup.clone_children" is removed. - * - * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup - * and its descendants contain no task; otherwise, 1. The file also - * generates kernfs notification which can be monitored through poll and - * [di]notify when the value of the file changes. - * - * - cpuset: tasks will be kept in empty cpusets when hotplug happens and - * take masks of ancestors with non-empty cpus/mems, instead of being - * moved to an ancestor. - * - * - cpuset: a task can be moved into an empty cpuset, and again it takes - * masks of ancestors. - * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * - * - blkcg: blk-throttle becomes properly hierarchical. - * - * - debug: disallowed on the default hierarchy. - */ -static inline bool cgroup_on_dfl(const struct cgroup *cgrp) -{ - return cgrp->root == &cgrp_dfl_root; -} - /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_has_tasks(struct cgroup *cgrp) { -- cgit v1.2.3 From 4a07c222d3afb00e1113834fee38d23a8e5d71dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:22 -0400 Subject: cgroup: replace "cgroup.populated" with "cgroup.events" memcg already uses "memory.events" for event reporting and other controllers may need event reporting too. Let's standardize on "$SUBSYS.events" interface file for reporting events which don't happen too frequently and thus can share event notification. "cgroup.populated" is replaced with "populated" field in "cgroup.events" and documentation is updated accordingly. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c5d41c3c1f00..d95cc88e9dc2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -226,7 +226,7 @@ struct cgroup { struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ - struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ + struct kernfs_node *events_kn; /* kn for "cgroup.events" */ /* * The bitmask of subsystems enabled on the child cgroups. -- cgit v1.2.3 From 7dbdb199d3bf88f043ea17e97113eb28d5b100bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE cftype->mode allows controllers to give arbitrary permissions to interface knobs. Except for "cgroup.event_control", the existing uses are spurious. * Some explicitly specify S_IRUGO | S_IWUSR even though that's the default. * "cpuset.memory_pressure" specifies S_IRUGO while also setting a write callback which returns -EACCES. All it needs to do is simply not setting a write callback. "cgroup.event_control" uses cftype->mode to make the file world-writable. It's a misdesigned interface and we don't want controllers to be tweaking interface file permissions in general. This patch removes cftype->mode and all its spurious uses and implements CFTYPE_WORLD_WRITABLE for "cgroup.event_control" which is marked as compatibility-only. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup-defs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d95cc88e9dc2..10d814bcd487 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -76,6 +76,7 @@ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ @@ -324,11 +325,6 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; unsigned long private; - /* - * If not 0, file mode is set to this value, otherwise it will - * be figured out automatically - */ - umode_t mode; /* * The maximum length of string, excluding trailing nul, that can -- cgit v1.2.3 From 6f60eade2433cb3a38687d5f8a4f44b92c6c51bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 17:54:23 -0400 Subject: cgroup: generalize obtaining the handles of and notifying cgroup files cgroup core handles creations and removals of cgroup interface files as described by cftypes. There are cases where the handle for a given file instance is necessary, for example, to generate a file modified event. Currently, this is handled by explicitly matching the callback method pointer and storing the file handle manually in cgroup_add_file(). While this simple approach works for cgroup core files, it can't for controller interface files. This patch generalizes cgroup interface file handle handling. struct cgroup_file is defined and each cftype can optionally tell cgroup core to store the file handle by setting ->file_offset. A file handle remains accessible as long as the containing css is accessible. Both "cgroup.procs" and "cgroup.events" are converted to use the new generic mechanism instead of hooking directly into cgroup_add_file(). Also, cgroup_file_notify() which takes a struct cgroup_file and generates a file modified event on it is added and replaces explicit kernfs_notify() invocations. This generalizes cgroup file handle handling and allows controllers to generate file modified notifications. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup-defs.h | 26 ++++++++++++++++++++++++-- include/linux/cgroup.h | 13 +++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 10d814bcd487..df589a097539 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -83,6 +83,17 @@ enum { __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ }; +/* + * cgroup_file is the handle for a file instance created in a cgroup which + * is used, for example, to generate file changed notifications. This can + * be obtained by setting cftype->file_offset. + */ +struct cgroup_file { + /* do not access any fields from outside cgroup core */ + struct list_head node; /* anchored at css->files */ + struct kernfs_node *kn; +}; + /* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. @@ -123,6 +134,9 @@ struct cgroup_subsys_state { */ u64 serial_nr; + /* all cgroup_files associated with this css */ + struct list_head files; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -226,8 +240,8 @@ struct cgroup { int populated_cnt; struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */ - struct kernfs_node *events_kn; /* kn for "cgroup.events" */ + struct cgroup_file procs_file; /* handle for "cgroup.procs" */ + struct cgroup_file events_file; /* handle for "cgroup.events" */ /* * The bitmask of subsystems enabled on the child cgroups. @@ -335,6 +349,14 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; + /* + * If non-zero, should contain the offset from the start of css to + * a struct cgroup_file field. cgroup will record the handle of + * the created file into it. The recorded handle can be used as + * long as the containing css remains accessible. + */ + unsigned int file_offset; + /* * Fields used for internal bookkeeping. Initialized automatically * during registration. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 355bf2ed8867..fb717f2cba5b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -490,6 +490,19 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +/** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +static inline void cgroup_file_notify(struct cgroup_file *cfile) +{ + /* might not have been created due to one of the CFTYPE selector flags */ + if (cfile->kn) + kernfs_notify(cfile->kn); +} + #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; -- cgit v1.2.3 From 472912a2b5e2027efd58aa47f78acb2373675187 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Sep 2015 18:01:59 -0400 Subject: memcg: generate file modified notifications on "memory.events" cgroup core only recently grew generic notification support. Wire up "memory.events" so that it triggers a file modified event whenever its content changes. v2: Refreshed on top of mem_cgroup relocation. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Li Zefan --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9aa7820c2177..c83c699a6605 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -213,6 +213,9 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + /* handle for "memory.events" */ + struct cgroup_file events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; @@ -286,6 +289,7 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, unsigned int nr) { this_cpu_add(memcg->stat->events[idx], nr); + cgroup_file_notify(&memcg->events_file); } bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); -- cgit v1.2.3 From 4530eddb59494b89650d6bcd980fc7f7717ad80c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Sep 2015 15:00:19 -0400 Subject: cgroup, memcg, cpuset: implement cgroup_taskset_for_each_leader() It wasn't explicitly documented but, when a process is being migrated, cpuset and memcg depend on cgroup_taskset_first() returning the threadgroup leader; however, this approach is somewhat ghetto and would no longer work for the planned multi-process migration. This patch introduces explicit cgroup_taskset_for_each_leader() which iterates over only the threadgroup leaders and replaces cgroup_taskset_first() usages for accessing the leader with it. This prepares both memcg and cpuset for multi-process migration. This patch also updates the documentation for cgroup_taskset_for_each() to clarify the iteration rules and removes comments mentioning task ordering in tasksets. v2: A previous patch which added threadgroup leader test was dropped. Patch updated accordingly. Signed-off-by: Tejun Heo Acked-by: Zefan Li Acked-by: Michal Hocko Cc: Johannes Weiner --- include/linux/cgroup.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index fb717f2cba5b..e9c3eac074e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -232,11 +232,33 @@ void css_task_iter_end(struct css_task_iter *it); * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @tset: taskset to iterate + * + * @tset may contain multiple tasks and they may belong to multiple + * processes. When there are multiple tasks in @tset, if a task of a + * process is in @tset, all tasks of the process are in @tset. Also, all + * are guaranteed to share the same source and destination csses. + * + * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) +/** + * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset + * @leader: the loop cursor + * @tset: takset to iterate + * + * Iterate threadgroup leaders of @tset. For single-task migrations, @tset + * may not contain any. + */ +#define cgroup_taskset_for_each_leader(leader, tset) \ + for ((leader) = cgroup_taskset_first((tset)); (leader); \ + (leader) = cgroup_taskset_next((tset))) \ + if ((leader) != (leader)->group_leader) \ + ; \ + else + /* * Inline functions. */ -- cgit v1.2.3 From 0de0942db2b36dd91c088a7950398d2e87f23b23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:49 -0400 Subject: cgroup: make cgroup->nr_populated count the number of populated css_sets Currently, cgroup->nr_populated counts whether the cgroup has any css_sets linked to it and the number of children which has non-zero ->nr_populated. This works because a css_set's refcnt converges with the number of tasks linked to it and thus there's no css_set linked to a cgroup if it doesn't have any live tasks. To help tracking resource usage of zombie tasks, putting the ref of css_set will be separated from disassociating the task from the css_set which means that a cgroup may have css_sets linked to it even when it doesn't have any live tasks. This patch updates cgroup->nr_populated so that for the cgroup itself it counts the number of css_sets which have tasks associated with them so that empty css_sets don't skew the populated test. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index df589a097539..17444505c870 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -232,10 +232,10 @@ struct cgroup { int id; /* - * If this cgroup contains any tasks, it contributes one to - * populated_cnt. All children with non-zero popuplated_cnt of - * their own contribute one. The count is zero iff there's no task - * in this cgroup or its subtree. + * Each non-empty css_set associated with this cgroup contributes + * one to populated_cnt. All children with non-zero popuplated_cnt + * of their own contribute one. The count is zero iff there's no + * task in this cgroup or its subtree. */ int populated_cnt; -- cgit v1.2.3 From 27bd4dbb8d51c476298e62bd088225317b7853de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:50 -0400 Subject: cgroup: replace cgroup_has_tasks() with cgroup_is_populated() Currently, cgroup_has_tasks() tests whether the target cgroup has any css_set linked to it. This works because a css_set's refcnt converges with the number of tasks linked to it and thus there's no css_set linked to a cgroup if it doesn't have any live tasks. To help tracking resource usage of zombie tasks, putting the ref of css_set will be separated from disassociating the task from the css_set which means that a cgroup may have css_sets linked to it even when it doesn't have any live tasks. This patch replaces cgroup_has_tasks() with cgroup_is_populated() which tests cgroup->nr_populated instead which locally counts the number of populated css_sets. Unlike cgroup_has_tasks(), cgroup_is_populated() is recursive - if any of the descendants is populated, the cgroup is populated too. While this changes the meaning of the test, all the existing users are okay with the change. While at it, replace the open-coded ->populated_cnt test in cgroup_events_show() with cgroup_is_populated(). Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e9c3eac074e2..bdfdb3a1a83c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -456,9 +456,9 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, } /* no synchronization, the result can only be used as a hint */ -static inline bool cgroup_has_tasks(struct cgroup *cgrp) +static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return !list_empty(&cgrp->cset_links); + return cgrp->populated_cnt; } /* returns ino associated with a cgroup */ -- cgit v1.2.3 From ed27b9f7a17ddfbc007e16d4d11f33dff4fc2de7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:52 -0400 Subject: cgroup: don't hold css_set_rwsem across css task iteration css_sets are synchronized through css_set_rwsem but the locking scheme is kinda bizarre. The hot paths - fork and exit - have to write lock the rwsem making the rw part pointless; furthermore, many readers already hold cgroup_mutex. One of the readers is css task iteration. It read locks the rwsem over the entire duration of iteration. This leads to silly locking behavior. When cpuset tries to migrate processes of a cgroup to a different NUMA node, css_set_rwsem is held across the entire migration attempt which can take a long time locking out forking, exiting and other cgroup operations. This patch updates css task iteration so that it locks css_set_rwsem only while the iterator is being advanced. css task iteration involves two levels - css_set and task iteration. As css_sets in use are practically immutable, simply pinning the current one is enough for resuming iteration afterwards. Task iteration is tricky as tasks may leave their css_set while iteration is in progress. This is solved by keeping track of active iterators and advancing them if their next task leaves its css_set. v2: put_task_struct() in css_task_iter_next() moved outside css_set_rwsem. A later patch will add cgroup operations to task_struct free path which may grab the same lock and this avoids deadlock possibilities. css_set_move_task() updated to use list_for_each_entry_safe() when walking task_iters and advancing them. This is necessary as advancing an iter may remove it from the list. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 3 +++ include/linux/cgroup.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 17444505c870..62413c3e2f4b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -211,6 +211,9 @@ struct css_set { */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* all css_task_iters currently walking this cset */ + struct list_head task_iters; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bdfdb3a1a83c..a9dcf0e76865 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -42,6 +42,10 @@ struct css_task_iter { struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; + + struct css_set *cur_cset; + struct task_struct *cur_task; + struct list_head iters_node; /* css_set->task_iters */ }; extern struct cgroup_root cgrp_dfl_root; -- cgit v1.2.3 From f0d9a5f175753a371bc7fdff0d584a8d9cd72bb0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock css_set_rwsem is the inner lock protecting css_sets and is accessed from hot paths such as fork and exit. Internally, it has no reason to be a rwsem or even mutex. There are no internal blocking operations while holding it. This was rwsem because css task iteration used to expose it to external iterator users. As the previous patch updated css task iteration such that the locking is not leaked to its users, there's no reason to keep it a rwsem. This patch converts css_set_rwsem to a spinlock and rename it to css_set_lock. It uses bh-safe operations as a planned usage needs to access it from RCU callback context. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a9dcf0e76865..46020735bcbb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -367,11 +366,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; -extern struct rw_semaphore css_set_rwsem; +extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ lockdep_is_held(&cgroup_mutex) || \ - lockdep_is_held(&css_set_rwsem) || \ + lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ -- cgit v1.2.3 From 2e91fa7f6d451e3ea9fec999065d2fd199691f9d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: keep zombies associated with their original cgroups cgroup_exit() is called when a task exits and disassociates the exiting task from its cgroups and half-attach it to the root cgroup. This is unnecessary and undesirable. No controller actually needs an exiting task to be disassociated with non-root cgroups. Both cpu and perf_event controllers update the association to the root cgroup from their exit callbacks just to keep consistent with the cgroup core behavior. Also, this disassociation makes it difficult to track resources held by zombies or determine where the zombies came from. Currently, pids controller is completely broken as it uncharges on exit and zombies always escape the resource restriction. With cgroup association being reset on exit, fixing it is pretty painful. There's no reason to reset cgroup membership on exit. The zombie can be removed from its css_set so that it doesn't show up on "cgroup.procs" and thus can't be migrated or interfere with cgroup removal. It can still pin and point to the css_set so that its cgroup membership is maintained. This patch makes cgroup core keep zombies associated with their cgroups at the time of exit. * Previous patches decoupled populated_cnt tracking from css_set lifetime, so a dying task can be simply unlinked from its css_set while pinning and pointing to the css_set. This keeps css_set association from task side alive while hiding it from "cgroup.procs" and populated_cnt tracking. The css_set reference is dropped when the task_struct is freed. * ->exit() callback no longer needs the css arguments as the associated css never changes once PF_EXITING is set. Removed. * cpu and perf_events controllers no longer need ->exit() callbacks. There's no reason to explicitly switch away on exit. The final schedule out is enough. The callbacks are removed. * On traditional hierarchies, nothing changes. "/proc/PID/cgroup" still reports "/" for all zombies. On the default hierarchy, "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged to at the time of exit. If the cgroup gets removed before the task is reaped, " (deleted)" is appended. v2: Build brekage due to missing dummy cgroup_free() when !CONFIG_CGROUP fixed. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- include/linux/cgroup-defs.h | 4 +--- include/linux/cgroup.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 62413c3e2f4b..6a1ab64ee5f9 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -435,9 +435,7 @@ struct cgroup_subsys { int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); - void (*exit)(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task); + void (*exit)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); int early_init; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 46020735bcbb..22e3754f89c5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p, extern void cgroup_post_fork(struct task_struct *p, void *old_ss_priv[CGROUP_CANFORK_COUNT]); void cgroup_exit(struct task_struct *p); +void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p, static inline void cgroup_post_fork(struct task_struct *p, void *ss_priv[CGROUP_CANFORK_COUNT]) {} static inline void cgroup_exit(struct task_struct *p) {} +static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } -- cgit v1.2.3 From afcf6c8b75444382e0f9996157207ebae34a8848 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Oct 2015 16:41:53 -0400 Subject: cgroup: add cgroup_subsys->free() method and use it to fix pids controller pids controller is completely broken in that it uncharges when a task exits allowing zombies to escape resource control. With the recent updates, cgroup core now maintains cgroup association till task free and pids controller can be fixed by uncharging on free instead of exit. This patch adds cgroup_subsys->free() method and update pids controller to use it instead of ->exit() for uncharging. Signed-off-by: Tejun Heo Cc: Aleksa Sarai --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6a1ab64ee5f9..60d44b26276d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -436,6 +436,7 @@ struct cgroup_subsys { void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); void (*exit)(struct task_struct *task); + void (*free)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); int early_init; -- cgit v1.2.3