From ea8fd3b47ff4ed4b1b5942bf3e0cb8d8f590ec59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: cgroup_apply_cftypes() shouldn't skip the default hierarhcy cgroup_apply_cftypes() skips creating or removing files if the subsystem is attached to the default hierarchy, which led to missing files in the root of the default hierarchy. Skipping made sense when the default hierarchy was dummy; however, now that the default hierarchy is fully functional and planned to be used as the unified hierarchy, it shouldn't be skipped over. Reported-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..a6894272353b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2436,10 +2436,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) lockdep_assert_held(&cgroup_tree_mutex); - /* don't bother if @ss isn't attached */ - if (ss->root == &cgrp_dfl_root) - return 0; - /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; -- cgit v1.2.3 From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarchy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. 
* Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6894272353b..f944619077f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -529,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -742,7 +742,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1050,8 +1050,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + dst_root->subsys_mask |= 1 << ssid; + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1069,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ 
-1273,12 +1276,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1535,7 +1538,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3658,8 +3661,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); - cgrp->subsys_mask |= 1 << ss->id; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", @@ -3780,13 +3781,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { + if (parent->child_subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; } } + cgrp->child_subsys_mask = parent->child_subsys_mask; + kernfs_activate(kn); mutex_unlock(&cgroup_mutex); @@ -3882,7 +3885,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); @@ -3911,28 +3923,6 @@ static void __kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. 
- */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4145,7 +4135,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4302,7 +4292,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3 From aec3dfcb2e43892180ee053e8c260dcdeccf4392 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: introduce effective cgroup_subsys_state In the planned default unified hierarchy, controllers may get dynamically attached to and detached from a cgroup and a cgroup may not have csses for all the controllers associated with the hierarchy. When a cgroup doesn't have its own css for a given controller, the css of the nearest ancestor with the controller enabled will be used, which is called the effective css. This patch introduces cgroup_e_css() and for_each_e_css() to access the effective csses and convert compare_css_sets(), find_existing_css_set() and cgroup_migrate() to use the effective csses so that they can handle cgroups with partial csses correctly. This means that for two css_sets to be considered identical, they should have both matching csses and cgroups. 
compare_css_sets() already compares both, not for correctness but for optimization. As this now becomes a matter of correctness, update the comments accordingly. For all !default hierarchies, cgroup_e_css() always equals cgroup_css(), so this patch doesn't change behavior. While at it, fix incorrect locking comment for for_each_css(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 83 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f944619077f4..4eb2dd1bb5b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->dummy_css; } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->dummy_css; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ss->id))) + cgrp = cgrp->parent; + + return cgroup_css(cgrp, ss); +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -273,7 +301,7 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. 
+ * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -283,6 +311,20 @@ static int notify_on_release(const struct cgroup *cgrp) lockdep_is_held(&cgroup_mutex)))) { } \ else +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor @@ -452,20 +494,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. */ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -530,13 +572,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. 
So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -1969,7 +2014,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, return 0; /* check that we can legitimately attach to the cgroup */ - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); if (ret) { @@ -1999,7 +2044,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, */ tset.csets = &tset.dst_csets; - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -2007,7 +2052,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, goto out_release_tset; out_cancel_attach: - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css == failed_css) break; if (css->ss->cancel_attach) -- cgit v1.2.3 From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. 
Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4eb2dd1bb5b1..37d966289978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -425,6 +425,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -432,6 +434,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -673,7 +677,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -724,10 +730,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -1028,7 +1038,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + int ssid, i, ret; 
lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -1081,6 +1091,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1095,6 +1106,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); @@ -1417,6 +1434,9 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { + struct cgroup_subsys *ss; + int ssid; + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); @@ -1425,6 +1445,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); } static void init_cgroup_root(struct cgroup_root *root, @@ -4249,6 +4272,9 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + /* * cftype registration needs kmalloc and can't be done * during early_init. Register base cftypes separately. -- cgit v1.2.3 From 3b281afbc3a06cd69c54e6db1a04a8e73997723f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: make css_next_child() skip missing csses css_next_child() walks the children of the specified css. It does this by finding the next cgroup and then returning the requested css. 
On the default unified hierarchy, a cgroup may not have a css associated with it even if the hierarchy has the subsystem enabled. This patch updates css_next_child() so that it skips children without the requested css associated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37d966289978..0edc186cd545 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2708,10 +2708,19 @@ css_next_child(struct cgroup_subsys_state *pos_css, break; } - if (&next->sibling == &cgrp->children) - return NULL; + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling; however, it might have @ss disabled. If so, + * fast-forward to the next enabled one. + */ + while (&next->sibling != &cgrp->children) { + struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - return cgroup_css(next, parent_css->ss); + if (next_css) + return next_css; + next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + } + return NULL; } /** -- cgit v1.2.3 From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. 
As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0edc186cd545..d48163b26196 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2857,27 +2857,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2903,8 +2906,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2920,12 +2923,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct 
list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2936,13 +2937,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l == &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } -- cgit v1.2.3 From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on a non-dummy css. 
Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d48163b26196..ad28866ed44c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,8 +2868,14 @@ static void css_advance_task_iter(struct css_task_iter *it) it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); it->cset_pos = l; @@ -2906,7 +2912,13 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->cset_pos = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); -- cgit v1.2.3 From e32978031016f56be977a9a856ba4d9f447db51f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: cgroup->subsys[] should be cleared after the css is offlined After a css finishes offlining, offline_css() mistakenly performs RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css) which just sets the cgroup->subsys[] pointer to the current value. The intention was to clear it after offline is complete, not reassign the same value. Update it to assign NULL instead of the current value. This makes cgroup_css() return NULL once offline is complete. 
All the existing users of the function either can handle NULL return already or guarantee that the css doesn't get offlined. While this is a bugfix, as css lifetime is currently tied to the cgroup it belongs to, this bug doesn't cause any actual problems. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad28866ed44c..83a8fff43d68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3710,7 +3710,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); } /** -- cgit v1.2.3 From bd53d617b34c781dac8e22dbc75e8f182d918ecf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: allow cgroup creation and suppress automatic css creation in the unified hierarchy Now that effective css handling has been added and iterators updated accordingly, it's safe to allow cgroup creation in the default hierarchy. Unblock cgroup creation in the default hierarchy. As the default hierarchy will implement explicit enabling and disabling of controllers on each cgroup, suppress automatic css enabling on cgroup creation. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83a8fff43d68..2a4f88db3205 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -3786,13 +3788,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; - /* - * XXX: The default hierarchy isn't fully implemented yet. Block - * !root cgroup creation on it for now. - */ - if (root == &cgrp_dfl_root) - return -EINVAL; - /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -3878,7 +3873,12 @@ static long cgroup_create(struct cgroup *parent, const char *name, } } - cgrp->child_subsys_mask = parent->child_subsys_mask; + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; kernfs_activate(kn); -- cgit v1.2.3 From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a4f88db3205..c66bfc8ee8a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -651,6 +651,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; -- cgit v1.2.3 From 7fd8c565d8a501486d63d7ee07fd6582e97db437 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: update subsystem rebind restrictions Because the default root couldn't have any non-root csses attached to it, rebinding away from it was always allowed; however, the default hierarchy will soon host the unified hierarchy and have non-root csses so the rebind restrictions need to be updated accordingly. Instead of special casing rebinding from the default hierarchy and then checking whether the source hierarchy has children cgroups, which implies non-root csses for !dfl hierarchies, simply check whether the source hierarchy has non-root csses for the subsystem using css_next_child(). 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c66bfc8ee8a7..15eb2273d80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1051,16 +1051,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (!(ss_mask & (1 << ssid))) continue; - /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgrp_dfl_root) - continue; - - /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->cgrp.children)) + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgrp_dfl_root) + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; } -- cgit v1.2.3 From f817de98513d060023be4fa1d061b29a6515273e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: prepare migration path for unified hierarchy Unified hierarchy implementation would require re-migrating tasks onto the same cgroup on the default hierarchy to reflect updated effective csses. Update cgroup_migrate_prepare_dst() so that it accepts NULL as the destination cgrp. When NULL is specified, the destination is considered to be the cgroup on the default hierarchy associated with each css_set. After this change, the identity check in cgroup_migrate_add_src() isn't sufficient for noop detection as the associated csses may change without any cgroup association changing. The only way to tell whether a migration is noop or not is testing whether the source and destination csets are identical. The noop check in cgroup_migrate_add_src() is removed and cset identity test is added to cgroup_migrate_prepare_dst(). 
If it's detected that source and destination csets are identical, the cset is removed from @preloaded_csets and all the migration nodes are cleared which makes cgroup_migrate() ignore the cset. Also, make the function append the destination css_sets to @preloaded_csets so that destination css_sets always come after source css_sets. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15eb2273d80b..8c2835a9e192 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1902,10 +1902,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - /* nothing to do if this cset already belongs to the cgroup */ - if (src_cgrp == dst_cgrp) - return; - if (!list_empty(&src_cset->mg_preload_node)) return; @@ -1920,13 +1916,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved to @dst_cgrp and all the source css_sets * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set.
After migration is performed @@ -1937,19 +1934,34 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, struct list_head *preloaded_csets) { LIST_HEAD(csets); - struct css_set *src_cset; + struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, dst_cgrp); + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. + */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) @@ -1958,7 +1970,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, put_css_set(dst_cset, false); } - list_splice(&csets, preloaded_csets); + list_splice_tail(&csets, preloaded_csets); return 0; err: cgroup_migrate_finish(&csets); -- cgit v1.2.3 From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. 
The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following. root - A - B - C \ D root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". Combined with the other properties of the default hierarchy, this guarantees that, from the view point of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. 
Having these structural constraints imposed from cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers. When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css. This is implemented by first updating cgroup states such that cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy. * When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup. * A white-space separated list of controller names prefixed with either '+' or '-' can be written to "cgroup.subtree_control". The controllers prefixed with '+' are enabled and the ones prefixed with '-' are disabled. * A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled. * If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup. * All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory. * "cgroup.controllers" in all non-root cgroups is a read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control".
This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches. v2: Non-root cgroups now also have "cgroup.controllers". Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 365 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c2835a9e192..809dd903ceb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); @@ -338,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->children, sibling) \ + if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 
* @cgrp: the cgroup to be checked for liveness @@ -1450,6 +1460,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1938,6 +1950,14 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2303,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = 
seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. 
+ */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, char *buffer) +{ + unsigned long enable_req = 0, disable_req = 0, enable, disable; + struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup_subsys *ss; + char *tok, *p; + int ssid, ret; + + /* + * Parse input - white space separated list of subsystem names + * prefixed with either + or -. 
+ */ + p = buffer; + while ((tok = strsep(&p, " \t\n"))) { + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable_req |= 1 << ssid; + disable_req &= ~(1 << ssid); + } else if (*tok == '-') { + disable_req |= 1 << ssid; + enable_req &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup_lock_live_group() already provides enough + * protection. Ensure @cgrp stays accessible and break the + * active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(cgrp->control_kn); +retry: + enable = enable_req; + disable = disable_req; + + mutex_lock(&cgroup_tree_mutex); + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + wait_queue_t wait; + + if (!cgroup_css(child, ss)) + continue; + + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_tree_mutex); + schedule(); + finish_wait(&child->offline_waitq, &wait); + goto retry; + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock_tree; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? 
*/ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock_tree; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock_tree; + } + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. + */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + mutex_unlock(&cgroup_mutex); +out_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(cgrp->control_kn); + cgroup_put(cgrp); + return ret; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 
size_t nbytes, loff_t off) { @@ -2462,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_subtree_control_show) + cgrp->control_kn = kn; + return 0; } /** @@ -3557,6 +3902,22 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write_string = cgroup_subtree_control_write, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3725,6 +4086,8 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** -- cgit v1.2.3 From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. * It delivers events by forking and execing a userland binary specified as the release_agent. This is a long deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is single monitoring point at the root. There's no way to delegate management of a subtree. 
* The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. The "notify_on_release" file is used to subscribe to or suppress release events. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements the interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants; otherwise, 1, and a kernfs_notify() notification is triggered when the value changes, which can be monitored through poll and [di]notify. This is a lot lighter and simpler and trivially allows delegating management of a subhierarchy - subhierarchy monitoring can block further propagation simply by putting itself or another process in the root of the subhierarchy and monitoring events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself.
Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 809dd903ceb8..0f986f7afee4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -411,6 +411,43 @@ static struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgrp->parent; + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. 
This hash doesn't (currently) take into @@ -456,10 +493,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -668,7 +708,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -2643,6 +2687,12 @@ err_undo_css: goto out_unlock; } +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2809,6 +2859,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->seq_show == cgroup_subtree_control_show) cgrp->control_kn = kn; + else if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; return 0; } @@ -3918,6 +3970,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_subtree_control_show, .write_string = cgroup_subtree_control_write, }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." 
prefix and -- cgit v1.2.3 From 2f0edc04e702fc07d29621f9e361b9120a7594d0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: clean up obsolete comment for parse_cgroupfs_options() 1d5be6b287c8efc87 ("cgroup: move module ref handling into rebind_subsystems()") makes parse_cgroupfs_options() no longer take refcounts on subsystems. And unified hierarchy means parse_cgroupfs_options() no longer needs to be called with cgroup_mutex held to protect the cgroup_subsys[]. So this patch removes BUG_ON() and the comment. As the comment doesn't contain useful information afterwards, the whole comment is removed. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f986f7afee4..fb848be0ea7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1221,12 +1221,6 @@ struct cgroup_sb_opts { bool none; }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. - */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; @@ -1235,8 +1229,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_cgrp_id); #endif -- cgit v1.2.3 From f8719ccf7bc0858384c7e93d8c57fe69ae8c9eac Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: remove orphaned cgroup_pidlist_seq_operations 6612f05b88fa309c9 ("cgroup: unify pidlist and other file handling") has removed the only user of cgroup_pidlist_seq_operations: cgroup_pidlist_open(). This patch removes it.
Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb848be0ea7b..3849d3d2dfe1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3880,17 +3880,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { -- cgit v1.2.3 From a2a1f9eaf945c46b5b2bc0e439cba68888e3d540 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: replace pr_warning with preferred pr_warn As suggested by scripts/checkpatch.pl, substitute all pr_warning() with pr_warn(). No functional change. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3849d3d2dfe1..cb453e9954c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1126,9 +1126,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue.
*/ if (cgrp_dfl_root_visible) { - pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,7 +1323,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || @@ -1387,8 +1387,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); + pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~opts.subsys_mask; @@ -1669,7 +1669,7 @@ retry: ret = -EINVAL; goto out_unlock; } else { - pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } @@ -4168,10 +4168,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warning("cgroup: %s (%d) created 
nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); + pr_warn("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); ss->warned_broken_hierarchy = true; } -- cgit v1.2.3 From ed3d261b53f51c9505822d757d1800c79fb68788 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: Use more current logging style Use pr_fmt and remove embedded prefixes. Realign modified multi-line statements to open parenthesis. Convert embedded function name to "%s: ", __func__ Signed-off-by: Joe Perches Signed-off-by: Tejun Heo --- kernel/cgroup.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb453e9954c1..3873267c9ee3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@ * distribution for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1126,9 +1128,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", ret, ss_mask); - pr_warn("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1323,12 +1325,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || opts->name) { - pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } else { @@ -1374,7 +1376,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) unsigned long added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: remount is not allowed\n"); + pr_err("sane_behavior: remount is not allowed\n"); return -EINVAL; } @@ -1387,7 +1389,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; if (opts.subsys_mask != root->subsys_mask || opts.release_agent) - pr_warn("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = opts.subsys_mask & ~root->subsys_mask; @@ -1396,7 +1398,7 @@ static int cgroup_remount(struct kernfs_root 
*kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1665,11 +1667,11 @@ retry: if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + pr_err("sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; goto out_unlock; } else { - pr_warn("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); } } @@ -2889,8 +2891,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], if (is_add) { ret = cgroup_add_file(cgrp, cft); if (ret) { - pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, ret); + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); return ret; } } else { @@ -4168,10 +4170,10 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { - pr_warn("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warn("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } -- cgit v1.2.3 From 69dfa00ccb72a37f3810687ca110e5a8154c6eed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: make flags and subsys_masks unsigned int There's no reason to use atomic bitops for cgroup_subsys_state->flags, cgroup_root->flags and various subsys_masks. This patch updates those to use bitwise and/or operations instead and converts them from unsigned long to unsigned int. This makes the fields occupy (marginally) smaller space and makes it clear that they don't require atomicity. This patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3873267c9ee3..21667f396a1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,7 +181,7 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); + unsigned int ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -963,7 +963,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops
cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -1079,7 +1079,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; @@ -1087,15 +1087,14 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) cgroup_addrm_files(cgrp, cfts, false); } } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; int ssid, i, ret; @@ -1128,7 +1127,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1214,8 +1213,8 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; @@ -1227,12 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_cgrp_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1313,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1342,7 +1341,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) if (!ss->disabled) - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); /* * We either have to specify by name or by subsystems. 
(So @@ -1373,7 +1372,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("sane_behavior: remount is not allowed\n"); @@ -1398,7 +1397,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1522,7 +1521,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2507,7 +2506,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned long enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable_req = 0, disable_req = 0, enable, disable; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -3998,7 +3997,7 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. 
*/ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -4007,7 +4006,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) { -- cgit v1.2.3 From 7d699ddb2b181a2c76e5ea18b1bdf102c4bebe4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup, memcg: allocate cgroup ID from 1 Currently, cgroup->id is allocated from 0, which is always assigned to the root cgroup; unfortunately, memcg wants to use ID 0 to indicate invalid IDs and ends up incrementing all IDs by one. It's reasonable to reserve 0 for special purposes. This patch updates cgroup core so that ID 0 is not used and the root cgroups get ID 1. The ID incrementing is removed from memcg. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21667f396a1e..3fa0463e74bb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1531,7 +1531,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4225,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup.
*/ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; -- cgit v1.2.3 From 6fa4918d03c39351aef3573ac3e7958d6a5ad9b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: protect cgroup_root->cgroup_idr with a spinlock Currently, cgroup_root->cgroup_idr is protected by cgroup_mutex, which ends up requiring cgroup_put() to be invoked under sleepable context. This is okay for now but is an unusual requirement and we'll soon add css->id which will have the same problem but won't be able to simply grab cgroup_mutex as removal will have to happen from css_release() which can't sleep. Introduce cgroup_idr_lock and idr_alloc/replace/remove() wrappers which protects the idr operations with the lock and use them for cgroup_root->cgroup_idr. cgroup_put() no longer needs to grab cgroup_mutex and css_from_id() is updated to always require RCU read lock instead of either RCU read lock or cgroup_mutex, which doesn't affect the existing users. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 51 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fa0463e74bb..7cb9c0847445 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -99,6 +99,12 @@ static DEFINE_MUTEX(cgroup_mutex); static DECLARE_RWSEM(css_set_rwsem); #endif +/* + * Protects cgroup_idr so that IDs can be released without grabbing + * cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 
@@ -190,6 +196,37 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask); + spin_unlock(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock(&cgroup_idr_lock); +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -1058,9 +1095,7 @@ static void cgroup_put(struct cgroup *cgrp) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -1531,7 +1566,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); + ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); if (ret < 0) goto out; root_cgrp->id = ret; @@ -4225,7 +4260,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
*/ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { err = -ENOMEM; goto err_unlock; @@ -4268,7 +4303,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_kn_set_ugid(kn); if (err) @@ -4302,7 +4337,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, return 0; err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -5162,7 +5197,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutexes_or_rcu_locked(); + WARN_ON_ONCE(!rcu_read_lock_held()); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit v1.2.3 From a2bed8209a3afc3b2cf1c28383fb48155c1fea46 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: use RCU free in create_css() failure path Currently, when create_css() fails in the middle, the half-initialized css is freed by invoking cgroup_subsys->css_free() directly. This patch updates the function so that it invokes RCU free path instead. As the RCU free path puts the parent css and owning cgroup, their references are now acquired right after a new css is successfully allocated. This doesn't make any visible difference now but is to enable implementing css->id and RCU protected lookup by such IDs. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7cb9c0847445..0e2c401ed7b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4185,12 +4185,14 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); + init_css(css, ss, cgrp); + cgroup_get(cgrp); + css_get(css->parent); + err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_free_css; - init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free_percpu_ref; @@ -4199,9 +4201,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_clear_dir; - cgroup_get(cgrp); - css_get(css->parent); - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -4218,7 +4217,7 @@ err_clear_dir: err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: - ss->css_free(css); + call_rcu(&css->rcu_head, css_free_rcu_fn); return err; } -- cgit v1.2.3 From ddfcadab35dda6e5bc23ccf1c3055ecb63a71e49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup: update init_css() into init_and_link_css() init_css() takes the cgroup the new css belongs to as an argument and initializes the new css's ->cgroup and ->parent pointers but doesn't acquire the matching reference counts. After the previous patch, create_css() puts init_css() and reference acquisition right next to each other. Let's move reference acquisition into init_css() and rename the function to init_and_link_css(). This makes sense and is easier to follow. This makes the root csses hold a reference on cgrp_dfl_root.cgrp, which is harmless.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0e2c401ed7b9..f1c98c527b2d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4109,17 +4109,21 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + cgroup_get(cgrp); + css->cgroup = cgrp; css->ss = ss; css->flags = 0; - if (cgrp->parent) + if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); - else + css_get(css->parent); + } else { css->flags |= CSS_ROOT; + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -4185,9 +4189,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (IS_ERR(css)) return PTR_ERR(css); - init_css(css, ss, cgrp); - cgroup_get(cgrp); - css_get(css->parent); + init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release); if (err) @@ -4656,7 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, &cgrp_dfl_root.cgrp); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is -- cgit v1.2.3 From 15a4c835e4ed3e60dd68727cd1907e3dd89563f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup, memcg: implement css->id and convert css_from_id() to use it Until now, cgroup->id has been used to identify all the associated csses and css_from_id() takes cgroup ID and returns the matching css by looking up the cgroup and then dereferencing the 
css associated with it; however, now that the lifetimes of cgroup and css are separate, this is incorrect and breaks on the unified hierarchy when a controller is disabled and enabled back again before the previous instance is released. This patch adds css->id which is a subsystem-unique ID and converts css_from_id() to look up by the new css->id instead. memcg is the only user of css_from_id() and also converted to use css->id instead. For traditional hierarchies, this shouldn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jianyu Zhan Acked-by: Li Zefan --- kernel/cgroup.c | 59 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1c98c527b2d..a1a20e8c973a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -100,8 +100,8 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* - * Protects cgroup_idr so that IDs can be released without grabbing - * cgroup_mutex. + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); @@ -1089,12 +1089,6 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; - /* - * XXX: cgrp->id is only used to look up css's. As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. 
- */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4104,8 +4098,11 @@ static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); + struct cgroup_subsys *ss = css->ss; + + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + cgroup_idr_remove(&ss->css_idr, css->id); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4195,9 +4192,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free_css; + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) @@ -4216,6 +4221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err_clear_dir: cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: @@ -4642,7 +4649,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; @@ -4651,6 +4658,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4659,6 +4667,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, 
&cgrp_dfl_root.cgrp); + if (early) { + /* idr_alloc() can't be called safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4709,7 +4724,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4741,8 +4756,16 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); @@ -5196,14 +5219,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG -- cgit v1.2.3 From 60106946ca7f63396680130b25511ccf6b7d5bff Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 May 2014 20:08:13 +0200 Subject: kernel/cgroup.c: fix 2 kernel-doc warnings Fix typo and variable name. 
tj: Updated @cgrp argument description in cgroup_destroy_css_killed() Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1a20e8c973a..07815ef7b1f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1888,7 +1888,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * @@ -4586,7 +4586,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /** * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work + * @cgrp: the cgroup whose csses have just finished offlining * * This function is invoked from a work item for a cgroup which is being * destroyed after all css's are offlined and performs the rest of -- cgit v1.2.3 From 0cee8b7786467907e12d1d8f872e6dc73bc95204 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: fix offlining child waiting in cgroup_subtree_control_write() cgroup_subtree_control_write() waits for offline to complete child-by-child before enabling a controller; however, it has a couple bugs. * It doesn't initialize the wait_queue_t. This can lead to infinite hang on the following schedule() among other things. * It forgets to pin the child before releasing cgroup_tree_mutex and performing schedule(). The child may already be gone by the time it wakes up and invokes finish_wait(). Pin the child being waited on. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9db1a9629a5c..95fc66b16091 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2594,16 +2594,18 @@ retry: * cases, wait till it's gone using offline_waitq. */ cgroup_for_each_live_child(child, cgrp) { - wait_queue_t wait; + DEFINE_WAIT(wait); if (!cgroup_css(child, ss)) continue; + cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); goto retry; } -- cgit v1.2.3 From 54504e977ceee0bea6fbe8b632eceea771b18c6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: cgroup_idr_lock should be bh cgroup_idr_remove() can be invoked from bh leading to lockdep detecting possible AA deadlock (IN_BH/ON_BH). Make the lock bh-safe. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95fc66b16091..e2ff925e6ee8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -203,9 +203,9 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, int ret; idr_preload(gfp_mask); - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } @@ -214,17 +214,17 @@ static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { - spin_lock(&cgroup_idr_lock); + spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); - spin_unlock(&cgroup_idr_lock); + spin_unlock_bh(&cgroup_idr_lock); } /** -- cgit v1.2.3 From 0ab7a60dea71c285dfbb65e344d895b9c4f7bcb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: css_release() shouldn't clear cgroup->subsys[] c1a71504e971 ("cgroup: don't recycle cgroup id until all csses' have been destroyed") made cgroup ID persist until a cgroup is released and added cgroup->subsys[] clearing to css_release() so that css_from_id() doesn't return a css which has already been released which happens before cgroup release; however, the right change here was updating offline_css() to clear cgroup->subsys[] which was done by e32978031016 ("cgroup: cgroup->subsys[] should be cleared after the css is offlined") instead of clearing it from css_release(). We're now clearing cgroup->subsys[] twice.
This is okay for traditional hierarchies as a css's lifetime is the same as its cgroup's; however, this confuses unified hierarchy and turning on and off a controller repeatedly using "cgroup.subtree_control" can lead to an oops like the following which happens because cgroup->subsys[] is incorrectly cleared asynchronously by css_release(). BUG: unable to handle kernel NULL pointer dereference at 00000000000000 08 IP: [] kill_css+0x21/0x1c0 PGD 1170d067 PUD f0ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 459 Comm: bash Not tainted 3.15.0-rc2-work+ #5 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff880009296710 ti: ffff88000e198000 task.ti: ffff88000e198000 RIP: 0010:[] [] kill_css+0x21/0x1c0 RSP: 0018:ffff88000e199dc8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff8238a968 RDI: ffff880009296f98 RBP: ffff88000e199de0 R08: 0000000000000001 R09: 02b0000000000000 R10: 0000000000000000 R11: ffff880009296fc0 R12: 0000000000000001 R13: ffff88000db6fc58 R14: 0000000000000001 R15: ffff8800139dcc00 FS: 00007ff9160c5740(0000) GS:ffff88001fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 0000000013947000 CR4: 00000000000006e0 Stack: ffff88000e199de0 ffffffff82389160 0000000000000001 ffff88000e199e80 ffffffff8113537f 0000000000000007 ffff88000e74af00 ffff88000e199e48 ffff880009296710 ffff88000db6fc00 ffffffff8239c100 0000000000000002 Call Trace: [] cgroup_subtree_control_write+0x85f/0xa00 [] cgroup_file_write+0x38/0x1d0 [] kernfs_fop_write+0xe7/0x170 [] vfs_write+0xb6/0x1c0 [] SyS_write+0x4d/0xc0 [] system_call_fastpath+0x16/0x1b Code: 5c 41 5d 41 5e 41 5f 5d c3 90 0f 1f 44 00 00 55 48 89 e5 41 54 53 48 89 fb 48 83 ec 08 8b 05 37 ad 29 01 85 c0 0f 85 df 00 00 00 <48> 8b 43 08 48 8b 3b be 01 00 00 00 8b 48 5c d3 e6 e8 49 ff ff RIP [] kill_css+0x21/0x1c0 RSP CR2: 0000000000000008 ---[ end trace 
e7aae1f877c4e1b4 ]--- Remove the unnecessary cgroup->subsys[] clearing from css_release(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2ff925e6ee8..35daf892b6e6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4102,7 +4102,6 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); struct cgroup_subsys *ss = css->ss; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); cgroup_idr_remove(&ss->css_idr, css->id); call_rcu(&css->rcu_head, css_free_rcu_fn); -- cgit v1.2.3 From d37167ab7b3d67d53519585a44c47416e6758ed2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:10:59 -0400 Subject: cgroup: update and fix parsing of "cgroup.subtree_control" I mistakenly believed that strsep() was equivalent to strtok_r() in skipping over consecutive delimiters. strsep() just splits at the first occurrence of one of the delimiters which makes the parsing very inflexible, which makes allowing multiple whitespace chars as delimiters kinda moot. Let's just be consistently strict and require a list of tokens separated by spaces. This is what Documentation/cgroups/unified-hierarchy.txt describes too. Also, parsing may access beyond the end of the string if the string ends with spaces or is zero-length. Make sure it skips zero-length tokens. Note that this also ensures that the parser doesn't puke on multiple consecutive spaces. v2: Add zero-length token skipping. v3: Added missing space after "==". Spotted by Li. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 35daf892b6e6..250def0694b4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2542,11 +2542,13 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, int ssid, ret; /* - * Parse input - white space separated list of subsystem names - * prefixed with either + or -. + * Parse input - space separated list of subsystem names prefixed + * with either + or -. */ p = buffer; - while ((tok = strsep(&p, " \t\n"))) { + while ((tok = strsep(&p, " "))) { + if (tok[0] == '\0') + continue; for_each_subsys(ss, ssid) { if (ss->disabled || strcmp(tok + 1, ss->name)) continue; -- cgit v1.2.3 From 7d331fa985d3a39d5b8cb60caf016d3e53e57c91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use restart_syscall() for retries after offline waits in cgroup_subtree_control_write() After waiting for a child to finish offline, cgroup_subtree_control_write() jumps up to retry from after the input parsing and active protection breaking. This retry makes the scheduled locking update - removal of cgroup_tree_mutex - more difficult. Let's simplify it by returning with restart_syscall() for retries. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250def0694b4..3251cc9070fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2535,7 +2535,7 @@ out_finish: static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, struct cftype *cft, char *buffer) { - unsigned int enable_req = 0, disable_req = 0, enable, disable; + unsigned int enable = 0, disable = 0; struct cgroup *cgrp = dummy_css->cgroup, *child; struct cgroup_subsys *ss; char *tok, *p; @@ -2554,11 +2554,11 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, continue; if (*tok == '+') { - enable_req |= 1 << ssid; - disable_req &= ~(1 << ssid); + enable |= 1 << ssid; + disable &= ~(1 << ssid); } else if (*tok == '-') { - disable_req |= 1 << ssid; - enable_req &= ~(1 << ssid); + disable |= 1 << ssid; + enable &= ~(1 << ssid); } else { return -EINVAL; } @@ -2576,9 +2576,6 @@ static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, */ cgroup_get(cgrp); kernfs_break_active_protection(cgrp->control_kn); -retry: - enable = enable_req; - disable = disable_req; mutex_lock(&cgroup_tree_mutex); @@ -2608,7 +2605,9 @@ retry: schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - goto retry; + + ret = restart_syscall(); + goto out_unbreak; } /* unavailable or not enabled on the parent? 
*/ @@ -2692,6 +2691,7 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); +out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); return ret; -- cgit v1.2.3 From 46cfeb043b04f5878154bea36714709d46028495 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:00 -0400 Subject: cgroup: use release_agent_path_lock in cgroup_release_agent_show() release_path is now protected by release_agent_path_lock to allow accessing it without grabbing cgroup_mutex; however, cgroup_release_agent_show() was still grabbing cgroup_mutex. Let's convert it to release_agent_path_lock so that we don't have to worry about this one for the planned locking updates. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3251cc9070fa..7633703e9baf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2373,11 +2373,10 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From ec903c0c858e4963a9e0724bdcadfa837253341c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:01 -0400 Subject: cgroup: rename css_tryget*() to css_tryget_online*() Unlike the more usual refcnting, what css_tryget() provides is the distinction between online and offline csses instead of protection against upping a refcnt which already reached zero. cgroup is planning to provide actual tryget which fails if the refcnt already reached zero. Let's rename the existing trygets so that they clearly indicate that they're about onliness. 
I thought about keeping the existing names as-are and introducing new names for the planned actual tryget; however, given that each controller participates in the synchronization of the online state, it seems worthwhile to make it explicit that these functions are about on/offline state. Rename css_tryget() to css_tryget_online() and css_tryget_from_dir() to css_tryget_online_from_dir(). This is pure rename. v2: cgroup_freezer grew new usages of css_tryget(). Update accordingly. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/cgroup.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7633703e9baf..671d8a6dae37 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3771,7 +3771,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); @@ -4060,9 +4060,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. 
css_release() schedules the @@ -4386,7 +4386,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. */ static void css_killed_work_fn(struct work_struct *work) { @@ -4398,8 +4398,8 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. + * css_tryget_online() is guaranteed to fail now. Tell subsystems + * to initate destruction. */ offline_css(css); @@ -4440,8 +4440,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { @@ -4462,7 +4462,7 @@ static void kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -4478,9 +4478,9 @@ static void kill_css(struct cgroup_subsys_state *css) * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. 
To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -4574,9 +4574,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * There are two control paths which try to determine cgroup from * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); @@ -5173,7 +5173,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -5181,8 +5181,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. 
*/ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -5204,7 +5204,7 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); -- cgit v1.2.3 From b41686401e501430ffe93b575ef7959d2ecc6f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: implement cftype->write() During the recent conversion to kernfs, cftype's seq_file operations are updated so that they are directly mapped to kernfs operations and thus can fully access the associated kernfs and cgroup contexts; however, write path hasn't seen similar updates and none of the existing write operations has access to, for example, the associated kernfs_open_file. Let's introduce a new operation cftype->write() which maps directly to the kernfs write operation and has access to all the arguments and contexts. This will replace ->write_string() and ->trigger() and ease manipulation of kernfs active protection from cgroup file operations. Two accessors - of_cft() and of_css() - are introduced to enable accessing the associated cgroup context from cftype->write() which only takes kernfs_open_file for the context information. The accessors for seq_file operations - seq_cft() and seq_css() - are rewritten to wrap the of_ accessors. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 671d8a6dae37..a16f91d12f4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -283,11 +283,10 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). @@ -302,7 +301,7 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) else return &cgrp->dummy_css; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -1035,8 +1034,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write || + cft->write_string || cft->trigger) mode |= S_IWUSR; return mode; @@ -2726,6 +2725,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and -- cgit v1.2.3 From 451af504df0c62f695a69b83c250486e77c66378 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->write_string() with cftype->write() Convert all cftype->write_string() users to the new cftype->write() which maps directly to kernfs write operation and 
has full access to kernfs and cgroup contexts. The conversions are mostly mechanical. * @css and @cft are accessed using of_css() and of_cft() accessors respectively instead of being specified as arguments. * Should return @nbytes on success instead of 0. * @buf is not trimmed automatically. Trim if necessary. Note that blkcg and netprio don't need this as the parsers already handle whitespaces. cftype->write_string() has no user left after the conversions and removed. While at it, remove unnecessary local variable @p in cgroup_subtree_control_write() and stale comment about CGROUP_LOCAL_BUFFER_SIZE in cgroup_freezer.c. This patch doesn't introduce any visible behavior changes. v2: netprio was missing from conversion. Converted. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Li Zefan Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Neil Horman Cc: "David S. Miller" --- kernel/cgroup.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a16f91d12f4e..2a88ce7b24b6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,7 +1035,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write || - cft->write_string || cft->trigger) + cft->trigger) mode |= S_IWUSR; return mode; @@ -2352,20 +2352,21 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup_root *root = cgrp->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if 
(!cgroup_lock_live_group(css->cgroup)) + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, + strlcpy(root->release_agent_path, strstrip(buf), sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); - return 0; + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) @@ -2530,21 +2531,22 @@ out_finish: } /* change the enabled child controllers for a cgroup in the default hierarchy */ -static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup *cgrp = of_css(of)->cgroup, *child; struct cgroup_subsys *ss; - char *tok, *p; + char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. 
*/ - p = buffer; - while ((tok = strsep(&p, " "))) { + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { @@ -2692,7 +2694,7 @@ out_unlock_tree: out_unbreak: kernfs_unbreak_active_protection(cgrp->control_kn); cgroup_put(cgrp); - return ret; + return ret ?: nbytes; err_undo_css: cgrp->child_subsys_mask &= ~enable; @@ -2738,9 +2740,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -3984,7 +3984,7 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.subtree_control", .flags = CFTYPE_ONLY_ON_DFL, .seq_show = cgroup_subtree_control_show, - .write_string = cgroup_subtree_control_write, + .write = cgroup_subtree_control_write, }, { .name = "cgroup.populated", @@ -4018,7 +4018,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ -- cgit v1.2.3 From 6770c64e5c8da4705d1f0973bdeb5c2bf4f3a404 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->trigger() with cftype->write() cftype->trigger() is pointless. It's trivial to ignore the input buffer from a regular ->write() operation. Convert all ->trigger() users to ->write() and remove ->trigger(). This patch doesn't introduce any visible behavior changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a88ce7b24b6..2f16aab03493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1034,8 +1034,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; @@ -2750,8 +2749,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } -- cgit v1.2.3 From acbef755f40e204b8a6503fa79958d51a898762a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: convert "tasks" and "cgroup.procs" handle to use cftype->write() cgroup_tasks_write() and cgroup_procs_write() are currently using cftype->write_u64(). This patch converts them to use cftype->write() instead. This allows access to the associated kernfs_open_file which will be necessary to implement the planned kernfs active protection manipulation for these files. This shifts buffer parsing to attach_task_by_pid() and makes it return @nbytes on success. Let's rename it to __cgroup_procs_write() to clearly indicate that this is a write handler implementation. This patch doesn't introduce any visible behavior changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2f16aab03493..9a48c117ebf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2232,12 +2232,18 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp = of_css(of)->cgroup; + pid_t pid; int ret; + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2305,7 +2311,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: mutex_unlock(&cgroup_mutex); - return ret; + return ret ?: nbytes; } /** @@ -2339,16 +2345,16 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, @@ -3953,7 +3959,7 @@ static 
struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -4002,7 +4008,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { -- cgit v1.2.3 From b7fc5ad235936379fae67a9f7b50bb53487a1a3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: remove cgroup->control_kn Now that cgroup_subtree_control_write() has access to the associated kernfs_open_file and thus the kernfs_node, there's no need to cache it in cgroup->control_kn on creation. Remove cgroup->control_kn and use @of->kn directly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a48c117ebf1..94d259bcd2b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2580,7 +2580,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * active_ref protection. 
*/ cgroup_get(cgrp); - kernfs_break_active_protection(cgrp->control_kn); + kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); @@ -2697,7 +2697,7 @@ out_unlock: out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); out_unbreak: - kernfs_unbreak_active_protection(cgrp->control_kn); + kernfs_unbreak_active_protection(of->kn); cgroup_put(cgrp); return ret ?: nbytes; @@ -2887,9 +2887,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return ret; } - if (cft->seq_show == cgroup_subtree_control_show) - cgrp->control_kn = kn; - else if (cft->seq_show == cgroup_populated_show) + if (cft->seq_show == cgroup_populated_show) cgrp->populated_kn = kn; return 0; } -- cgit v1.2.3 From ba0f4d761503bd9ce4c7458b56bfd7c3fdb51e86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: reorganize cgroup_create() Reorganize cgroup_create() so that all paths share unlock out path. * All err_* labels are renamed to out_* as they're now shared by both success and failure paths. * @err renamed to @ret for the similar reason as above and so that it's more consistent with other functions. * cgroup memory allocation moved after locking so that freeing failed cgroup happens before unlocking. While this moves more code inside critical section, memory allocations inside cgroup locking are already pretty common and this is unlikely to make any noticeable difference. * While at it, replace a stray @parent->root dereference with @root. This reorganization will help simplifying locking. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 69 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 94d259bcd2b9..1d6106c3fb4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4246,15 +4246,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, { struct cgroup *cgrp; struct cgroup_root *root = parent->root; - int ssid, err; + int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); /* @@ -4265,8 +4260,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, * don't get nasty surprises if we ever grow another caller. */ if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_unlock_tree; + ret = -ENODEV; + goto out_unlock_tree; + } + + /* allocate the cgroup and its ID, 0 is reserved for the root */ + cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; } /* @@ -4275,15 +4277,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { - err = -ENOMEM; - goto err_unlock; + ret = -ENOMEM; + goto out_free_cgrp; } init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4294,8 +4296,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { - err = PTR_ERR(kn); - goto err_free_id; + ret = PTR_ERR(kn); + goto out_free_id; } cgrp->kn = kn; @@ -4318,20 
+4320,20 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_kn_set_ugid(kn); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { if (parent->child_subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } @@ -4344,25 +4346,22 @@ static long cgroup_create(struct cgroup *parent, const char *name, kernfs_activate(kn); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - return 0; + ret = 0; + goto out_unlock; -err_free_id: +out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: +out_free_cgrp: + kfree(cgrp); +out_unlock: mutex_unlock(&cgroup_mutex); -err_unlock_tree: +out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(cgrp); - return err; + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - return err; + goto out_unlock; } static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, -- cgit v1.2.3 From b3bfd983ca94cf1393accc11e90123c83909babb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: collapse cgroup_create() into cgroup_mkdir() cgroup_mkdir() is the sole user of cgroup_create(). Let's collapse the latter into the former. This will help simplifying locking. While at it, remove now stale comment about inode locking. This patch doesn't introduce any functional changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 +++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d6106c3fb4e..580d3484f97a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4235,30 +4235,24 @@ err_free_css: return err; } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; + struct cgroup *parent = parent_kn->priv, *cgrp; struct cgroup_root *root = parent->root; - int ssid, ret; struct cgroup_subsys *ss; struct kernfs_node *kn; - - mutex_lock(&cgroup_tree_mutex); + int ssid, ret; /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. + * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). 
*/ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + mutex_lock(&cgroup_tree_mutex); if (!cgroup_lock_live_group(parent)) { ret = -ENODEV; goto out_unlock_tree; @@ -4357,6 +4351,8 @@ out_unlock: mutex_unlock(&cgroup_mutex); out_unlock_tree: mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); return ret; out_destroy: @@ -4364,28 +4360,6 @@ out_destroy: goto out_unlock; } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent = parent_kn->priv; - int ret; - - /* - * cgroup_create() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). - */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - - ret = cgroup_create(parent, name, mode); - - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); - return ret; -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. -- cgit v1.2.3 From ddab2b6e0e516098efe6d3b69585bb25b1408d61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: grab cgroup_mutex earlier in cgroup_subtree_control_write() Move cgroup_lock_live_group() invocation upwards to right below cgroup_tree_mutex in cgroup_subtree_control_write(). This is to help the planned locking simplification. This doesn't make any userland-visible behavioral changes. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 580d3484f97a..8afddb1a1c6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2583,6 +2583,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); mutex_lock(&cgroup_tree_mutex); + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2606,6 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); schedule(); finish_wait(&child->offline_waitq, &wait); @@ -2620,7 +2625,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, (cgrp->parent && !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; - goto out_unlock_tree; + goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->child_subsys_mask & (1 << ssid))) { @@ -2632,7 +2637,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_for_each_live_child(child, cgrp) { if (child->child_subsys_mask & (1 << ssid)) { ret = -EBUSY; - goto out_unlock_tree; + goto out_unlock; } } } @@ -2640,12 +2645,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (!enable && !disable) { ret = 0; - goto out_unlock_tree; - } - - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; + goto out_unlock; } /* -- cgit v1.2.3 From cfc79d5bec04cdf26cd207d3e73d8bd59fd780a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: move cgroup->kn->priv clearing to cgroup_rmdir() The ->priv field of a cgroup directory kernfs_node points back to 
the cgroup. This field is RCU cleared in cgroup_destroy_locked() for non-kernfs accesses from css_tryget_from_dir() and cgroupstats_build(). As these are only applicable to cgroups which finished creation successfully and fully initialized cgroups are always removed by cgroup_rmdir(), this can be safely moved to the end of cgroup_rmdir(). This will help simplifying cgroup locking and shouldn't introduce any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8afddb1a1c6c..b49e63d5386b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4546,17 +4546,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* remove @cgrp directory along with the base files */ mutex_unlock(&cgroup_mutex); - - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. - */ kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - mutex_lock(&cgroup_mutex); return 0; @@ -4615,6 +4605,17 @@ static int cgroup_rmdir(struct kernfs_node *kn) mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); + + /* + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. 
+ */ + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); + cgroup_put(cgrp); return ret; } @@ -5174,7 +5175,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See destroy_locked() for details. + * protected for this access. See cgroup_rmdir() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit v1.2.3 From a9746d8da786bc79b3b4ae1baa0fbbc4b795c1b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:22 -0400 Subject: cgroup: factor out cgroup_kn_lock_live() and cgroup_kn_unlock() cgroup_mkdir(), cgroup_rmdir() and cgroup_subtree_control_write() share the logic to break active protection so that they can grab cgroup_tree_mutex which nests above active protection and/or remove self. Factor out this logic into cgroup_kn_lock_live() and cgroup_kn_unlock(). This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 157 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 90 insertions(+), 67 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b49e63d5386b..21739e481006 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1093,6 +1093,75 @@ static void cgroup_put(struct cgroup *cgrp) call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. 
+ */ +static void cgroup_kn_unlock(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); +} + +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. + */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. 
+ */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; +} + static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; @@ -2541,7 +2610,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - struct cgroup *cgrp = of_css(of)->cgroup, *child; + struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; @@ -2573,20 +2642,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } - /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs - * active_ref. cgroup_lock_live_group() already provides enough - * protection. Ensure @cgrp stays accessible and break the - * active_ref protection. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(of->kn); - - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(cgrp)) { - ret = -ENODEV; - goto out_unlock_tree; - } + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) + return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { @@ -2610,14 +2668,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgroup_get(child); prepare_to_wait(&child->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + cgroup_kn_unlock(of->kn); schedule(); finish_wait(&child->offline_waitq, &wait); cgroup_put(child); - ret = restart_syscall(); - goto out_unbreak; + return restart_syscall(); } /* unavailable or not enabled on the parent? 
*/ @@ -2693,12 +2749,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, kernfs_activate(cgrp->kn); ret = 0; out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); -out_unbreak: - kernfs_unbreak_active_protection(of->kn); - cgroup_put(cgrp); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; err_undo_css: @@ -4238,25 +4289,16 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent = parent_kn->priv, *cgrp; - struct cgroup_root *root = parent->root; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; int ssid, ret; - /* - * cgroup_mkdir() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). - */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - mutex_lock(&cgroup_tree_mutex); - if (!cgroup_lock_live_group(parent)) { - ret = -ENODEV; - goto out_unlock_tree; - } + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); @@ -4348,11 +4390,7 @@ out_free_id: out_free_cgrp: kfree(cgrp); out_unlock: - mutex_unlock(&cgroup_mutex); -out_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); + cgroup_kn_unlock(parent_kn); return ret; out_destroy: @@ -4579,32 +4617,17 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *cgrp = kn->priv; + struct cgroup *cgrp; int ret = 0; - /* - * This is self-destruction but @kn can't be removed while this - * callback is in progress. Let's break active protection. 
Once - * the protection is broken, @cgrp can be destroyed at any point. - * Pin it so that it stays accessible. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(kn); - - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - /* - * @cgrp might already have been destroyed while we're trying to - * grab the mutexes. - */ - if (!cgroup_is_dead(cgrp)) - ret = cgroup_destroy_locked(cgrp); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + ret = cgroup_destroy_locked(cgrp); - kernfs_unbreak_active_protection(kn); + cgroup_kn_unlock(kn); /* * There are two control paths which try to determine cgroup from -- cgit v1.2.3 From e76ecaeef65c497153ceacf59c2e21c070d43f64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: use cgroup_kn_lock_live() in other cgroup kernfs methods Make __cgroup_procs_write() and cgroup_release_agent_write() use cgroup_kn_lock_live() and cgroup_kn_unlock() instead of cgroup_lock_live_group(). This puts the operations under both cgroup_tree_mutex and cgroup_mutex protection without circular dependency from kernfs active protection. Also, this means that cgroup_mutex is no longer nested below kernfs active protection. There is no longer any place where the two locks interact. This leaves cgroup_lock_live_group() without any user. Removed. This will help simplifying cgroup locking. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21739e481006..b7cd80845f6a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -386,23 +386,6 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 
- * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. On - * failure returns false with no lock held. - */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} - /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -2306,14 +2289,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; - struct cgroup *cgrp = of_css(of)->cgroup; + struct cgroup *cgrp; pid_t pid; int ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2379,7 +2363,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return ret ?: nbytes; } @@ -2429,17 +2413,18 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup *cgrp = of_css(of)->cgroup; - struct cgroup_root *root = cgrp->root; + struct cgroup *cgrp; - BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(cgrp)) + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, strstrip(buf), - sizeof(root->release_agent_path)); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return nbytes; } -- cgit v1.2.3 From 
01f6474ce04fffd6282b569ac0a31f4b98d4c82a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: nest kernfs active protection under cgroup_mutex After the recent cgroup_kn_lock_live() changes, cgroup_mutex is no longer nested below kernfs active protection. The two don't have any relationship now. This patch nests kernfs active protection under cgroup_mutex. All cftype operations now require both cgroup_tree_mutex and cgroup_mutex, temporary cgroup_mutex releases over kernfs operations are removed, and cgroup_add/rm_cftypes() grab both mutexes. This makes cgroup_tree_mutex redundant, which will be removed by the next patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7cd80845f6a..bf1d7ce250ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1127,7 +1127,7 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgrp = kn->parent->priv; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. @@ -1150,6 +1150,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1216,11 +1217,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. 
*/ - mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { struct cgroup_root *src_root; @@ -2946,6 +2945,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], int ret; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2981,6 +2981,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -3049,6 +3050,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; @@ -3075,7 +3077,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) int ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3106,12 +3110,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -4445,6 +4451,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* * This must happen before css is disassociated with its cgroup. 
@@ -4544,13 +4551,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. This - * involves removing the subsystem's files, drop cgroup_mutex. + * percpu refs of all css's are confirmed to be killed. */ - mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); - mutex_lock(&cgroup_mutex); /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); @@ -4567,10 +4571,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); - /* remove @cgrp directory along with the base files */ - mutex_unlock(&cgroup_mutex); - kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - mutex_lock(&cgroup_mutex); + /* + * Remove @cgrp directory along with the base files. @cgrp has an + * extra ref on its kn. + */ + kernfs_remove(cgrp->kn); return 0; }; -- cgit v1.2.3 From 8353da1f91f12a3079ecc849226f371242d2807c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:19:23 -0400 Subject: cgroup: remove cgroup_tree_mutex cgroup_tree_mutex was introduced to work around the circular dependency between cgroup_mutex and kernfs active protection - some kernfs file and directory operations needed cgroup_mutex putting cgroup_mutex under active protection but cgroup also needs to be able to access cgroup hierarchies and cftypes to determine which kernfs_nodes need to be removed. cgroup_tree_mutex nested above both cgroup_mutex and kernfs active protection and used to protect the hierarchy and cftypes. While this worked, it added a lot of double lockings and was generally cumbersome. kernfs provides a mechanism to opt out of active protection and cgroup was already using it for removal and subtree_control. 
There's no reason to mix both methods of avoiding circular locking dependency and the preceding cgroup_kn_lock_live() changes applied it to all relevant cgroup kernfs operations making it unnecessary to nest cgroup_mutex under kernfs active protection. The previous patch reversed the original lock ordering and put cgroup_mutex above kernfs active protection. After these changes, all cgroup_tree_mutex usages are now accompanied by cgroup_mutex making the former completely redundant. This patch removes cgroup_tree_mutex and all its usages. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++------------------------------------------------- 1 file changed, 9 insertions(+), 55 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf1d7ce250ac..457e52705f56 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,15 +70,6 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) -/* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding. This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. 
@@ -111,11 +102,10 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked() \ +#define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_[tree_]mutex or RCU read lock required"); + "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -243,7 +233,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_tree_mutex) || lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -347,7 +336,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -381,7 +369,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->children, sibling) \ - if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else @@ -869,7 +857,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -899,7 +886,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); @@ -1096,7 +1082,6 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) 
cgrp = kn->parent->priv; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); @@ -1135,7 +1120,6 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) cgroup_get(cgrp); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) @@ -1149,7 +1133,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1179,7 +1162,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) struct cgroup_subsys *ss; int ssid, i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for_each_subsys(ss, ssid) { @@ -1457,7 +1439,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1503,7 +1484,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -1606,7 +1586,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) struct css_set *cset; int i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); @@ -1696,7 +1675,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -1761,9 +1739,7 @@ retry: */ if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 
mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); msleep(10); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); goto retry; } @@ -1796,7 +1772,6 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); kfree(opts.name); @@ -2507,7 +2482,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct css_set *src_cset; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* look up all csses currently attached to @cgrp's subtree */ @@ -2866,20 +2840,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return -EPERM; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_tree_mutex. + * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -2944,7 +2916,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2980,7 +2951,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) struct cgroup_subsys_state *css; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ @@ -3049,7 +3019,6 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { 
- lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) @@ -3076,11 +3045,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3109,7 +3076,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); @@ -3118,7 +3084,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -3158,7 +3123,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -3224,7 +3189,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3264,7 +3229,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3311,7 +3276,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -4178,7 +4143,6 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -4196,7 +4160,6 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4399,7 +4362,6 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4417,7 +4379,6 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). 
Each css holds an extra @@ -4450,7 +4411,6 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4510,7 +4470,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) bool empty; int ssid; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4593,7 +4552,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ @@ -4647,7 +4605,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); idr_init(&ss->css_idr); @@ -4685,7 +4642,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); } /** @@ -4735,7 +4691,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -4745,7 +4700,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { if (ss->early_init) { -- cgit v1.2.3 From a015edd26e28afe225cdd04f25794bd2b3bbe2da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: use restart_syscall() for mount retries cgroup_mount() uses dumb delay-and-retry logic to wait for cgroup_root which is being destroyed. The retry currently loops inside cgroup_mount() proper. This patch makes it return with restart_syscall() instead so that retry travels out to userland boundary. 
This slightly simplifies the logic and more importantly makes the retry logic behave better when the wait for some reason becomes lengthy or infinite by allowing the operation to be suspended or terminated from userland. v2: The original patch forgot to free memory allocated for @opts. Fixed. Caught by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 457e52705f56..f36fd9c15b3a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; -retry: + /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1740,8 +1740,8 @@ retry: if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); - mutex_lock(&cgroup_mutex); - goto retry; + ret = restart_syscall(); + goto out_free; } ret = 0; @@ -1772,7 +1772,7 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - +out_free: kfree(opts.release_agent); kfree(opts.name); -- cgit v1.2.3 From 9d800df12d31734a6853915e9d2deb5d6747985f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: rename cgroup->dummy_css to ->self and move it to the top cgroup->dummy_css is used as the placeholder css when performing css oriented operations on the cgroup. We're gonna shift more cgroup management to this css. Let's rename it to ->self and move it to the top. This is pure rename and field relocation.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f36fd9c15b3a..b57a949ae4bc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,7 +220,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -235,13 +235,13 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effctive css, which is defined * as the matching css of the nearest ancestor including self which has @ss @@ -254,7 +254,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, lockdep_assert_held(&cgroup_mutex); if (!ss) - return &cgrp->dummy_css; + return &cgrp->self; if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; @@ -288,7 +288,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); @@ -1551,7 +1551,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); 
INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3454,7 +3454,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3719,7 +3719,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3793,7 +3793,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -4274,7 +4274,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; + cgrp->self.parent = &parent->self; cgrp->root = root; if (notify_on_release(parent)) -- cgit v1.2.3 From cbc125efada6d8c2555dd35e938694eb9b7cd791 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: separate out cgroup_has_live_children() from cgroup_destroy_locked() We're expecting another user. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b57a949ae4bc..3f5f48130b6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3295,6 +3295,21 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return css_parent(pos); } +static bool cgroup_has_live_children(struct cgroup *cgrp) +{ + struct cgroup *child; + + rcu_read_lock(); + list_for_each_entry_rcu(child, &cgrp->children, sibling) { + if (!cgroup_is_dead(child)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /** * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance @@ -4465,7 +4480,6 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *child; struct cgroup_subsys_state *css; bool empty; int ssid; @@ -4487,15 +4501,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * emptiness as dead children linger on it while being destroyed; * otherwise, "rmdir parent/child parent" may fail with -EBUSY. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (cgroup_has_live_children(cgrp)) return -EBUSY; /* -- cgit v1.2.3 From 9e4173e1f24fa3bd562f13b92ee34c7dfb1db7c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move check_for_release(parent) call to the end of cgroup_destroy_locked() Currently, check_for_release() on the parent of a destroyed cgroup is invoked from cgroup_destroy_css_killed(). This is because this is where the destroyed cgroup can be removed from the parent's children list. 
check_for_release() tests the emptiness of the list directly, so invoking it before removing the cgroup from the list makes it think that the parent still has children even when it no longer does. This patch updates check_for_release() to use cgroup_has_live_children() instead of directly testing ->children emptiness and moves check_for_release(parent) earlier to the end of cgroup_destroy_locked(). As cgroup_has_live_children() ignores cgroups marked DEAD, check_for_release() functions correctly as long as it's called after asserting DEAD. This makes release notification slightly more timely and more importantly enables further simplification of cgroup destruction path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f5f48130b6b..061569fba245 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4542,6 +4542,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); + check_for_release(cgrp->parent); + return 0; }; @@ -4556,17 +4559,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *parent = cgrp->parent; - lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); cgroup_put(cgrp); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); } static int cgroup_rmdir(struct kernfs_node *kn) @@ -5006,7 +5004,7 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { /* * Control Group is currently removeable. 
If it's not * already queued for a userspace notification, queue -- cgit v1.2.3 From 4e4e28472365f8c7a7c55f6b5706f68bc40c5b13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: move cgroup->sibling unlinking to cgroup_put() Move cgroup->sibling unlinking from cgroup_destroy_css_killed() to cgroup_put(). This is later but still before the RCU grace period, so it doesn't break css_next_child() although there now is a larger window in which a dead cgroup is visible during css iteration. As css iteration always could have included offline csses, this doesn't affect correctness; however, it does make css_next_child() fall back to reiterating mode more often. This also makes cgroup_put() directly take cgroup_mutex, which limits where it can be called from. These are not immediately problematic and will be dealt with later. This change enables simplification of cgroup destruction path. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 061569fba245..e9aa2a51ca68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1056,6 +1056,11 @@ static void cgroup_put(struct cgroup *cgrp) if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; + /* delete this cgroup from parent->children */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4561,9 +4566,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - cgroup_put(cgrp); } -- cgit v1.2.3 From 249f3468a282dcbad53484c821bebb447f14ee03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: remove cgroup_destory_css_killed()
cgroup_destroy_css_killed() is the cgroup destruction stage which happens after all csses are offlined. After the recent updates, it no longer does anything other than putting the base reference. This patch removes the function and makes cgroup_destroy_locked() put the base ref at the end instead. This also makes cgroup->nr_css unnecessary. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 62 ++++++-------------------------------------------------- 1 file changed, 6 insertions(+), 56 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9aa2a51ca68..4a94b0be598d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,7 +178,6 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); static void kill_css(struct cgroup_subsys_state *css); @@ -4169,7 +4168,6 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -4189,7 +4187,6 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); @@ -4374,39 +4371,18 @@ out_destroy: /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget_online() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css().
*/ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); - - /* - * css_tryget_online() is guaranteed to fail now. Tell subsystems - * to initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -4518,11 +4494,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. - */ + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -4532,15 +4504,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. - */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); - /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. 
@@ -4550,25 +4513,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); check_for_release(cgrp->parent); + /* put the base reference */ + cgroup_put(cgrp); + return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @cgrp: the cgroup whose csses have just finished offlining - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - lockdep_assert_held(&cgroup_mutex); - - cgroup_put(cgrp); -} - static int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; -- cgit v1.2.3 From 25e15d835036a70a53dcc993beaa036f8919a373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: bounce css release through css->destroy_work css release is planned to do more and would require process context. Bounce it through css->destroy_work. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4a94b0be598d..e694f4153edb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4126,10 +4126,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void css_release(struct percpu_ref *ref) +static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; cgroup_idr_remove(&ss->css_idr, css->id); @@ -4137,6 +4137,15 @@ static void css_release(struct percpu_ref *ref) call_rcu(&css->rcu_head, css_free_rcu_fn); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); +} + static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { -- cgit v1.2.3 From 9395a4500404e05173eda9a2d198b6fa500e90c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: enable refcnting for root csses Currently, css_get(), css_tryget() and css_tryget_online() are noops for root csses as an optimization; however, we're planning to use css refcnts to keep track of cgroup lifetime too and root cgroups also need to be reference counted. Since css has been converted to percpu_refcnt, the overhead of refcnting is minuscule and this optimization isn't too meaningful anymore. Furthermore, controllers which optimize the root cgroup often never even invoke these functions in their hot paths. This patch enables refcnting for root csses too.
This makes CSS_ROOT flag unused and removes it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e694f4153edb..cb5864e36f99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,8 +4158,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, if (cgrp->parent) { css->parent = cgroup_css(cgrp->parent, ss); css_get(css->parent); - } else { - css->flags |= CSS_ROOT; } BUG_ON(cgroup_css(cgrp, ss)); @@ -4582,9 +4580,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); if (early) { - /* idr_alloc() can't be called safely during early init */ + /* allocation can't be done safely during early init */ css->id = 1; } else { + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4671,6 +4670,7 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; + BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3 From 9d755d33f0db8c9b49438f71b38a56e375b34360 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: use cgroup->self.refcnt for cgroup refcnting Currently cgroup implements refcnting separately using atomic_t cgroup->refcnt. The destruction paths of cgroup and css are rather complex and bear a lot of similarities including the use of RCU and bouncing to a work item. This patch makes cgroup use the refcnt of self css for refcnting instead of using its own. This makes cgroup refcnting use css's percpu refcnt and share the destruction mechanism. * css_release_work_fn() and css_free_work_fn() are updated to handle both csses and cgroups.
This is a bit messy but should do until we can make cgroup->self a full css, which currently can't be done thanks to multiple hierarchies. * cgroup_destroy_locked() now performs percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp). * Negative refcnt sanity check in cgroup_get() is no longer necessary as percpu_ref already handles it. * Similarly, as a cgroup which hasn't been killed will never be released regardless of its refcnt value and percpu_ref has a sanity check on kill, cgroup_is_dead() sanity check in cgroup_put() is no longer necessary. * As whether a refcnt reached zero or not can only be decided after the reference count is killed, cgroup_root->cgrp's refcnting can no longer be used to decide whether to kill the root or not. Let's make cgroup_kill_sb() explicitly initiate destruction if the root doesn't have any children. This makes sense anyway as an unmounted cgroup hierarchy without any children should be destroyed. While this is a bit messy, this will allow pushing more bookkeeping towards cgroup->self and thus handling cgroups and csses in a more uniform way. In the very long term, it should be possible to introduce a base subsystem and convert the self css to a proper one making things a whole lot simpler and unified.
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 146 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 66 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb5864e36f99..c01e8e8dfad0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static void cgroup_free_fn(struct work_struct *work) -{ - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released. 
- */ - cgroup_destroy_root(cgrp->root); - } -} - -static void cgroup_free_rcu(struct rcu_head *head) -{ - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); -} - static void cgroup_get(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + css_get(&cgrp->self); } static void cgroup_put(struct cgroup *cgrp) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; - - /* delete this cgroup from parent->children */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); - mutex_unlock(&cgroup_mutex); - - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + css_put(&cgrp->self); } /** @@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->cset_links); @@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1657,6 +1615,8 @@ destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); 
+cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. */ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); msleep(10); ret = restart_syscall(); @@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. 
+ */ + if (cgroup_has_live_children(&root->cgrp)) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. 
+ */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; - cgroup_idr_remove(&ss->css_idr, css->id); + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + mutex_lock(&cgroup_mutex); + list_del_rcu(&cgrp->sibling); + mutex_unlock(&cgroup_mutex); + + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4285,6 +4293,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
@@ -4292,7 +4304,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_free_cgrp; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); @@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, out_free_id: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); out_unlock: @@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) check_for_release(cgrp->parent); /* put the base reference */ - cgroup_put(cgrp); + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -- cgit v1.2.3 From 3b514d24e200fcdcde0a57c354a51d3677a86743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:47 -0400 Subject: cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css 9395a4500404 ("cgroup: enable refcnting for root csses") enabled reference counting for root csses (cgroup_subsys_states) so that cgroup's self csses can be used to manage the lifetime of the containing cgroups. Unfortunately, this change was incorrect. During early init, cgrp_dfl_root self css refcnt is used. percpu_ref can't be initialized during early init and its initialization is deferred till cgroup_init() time. This means that cpu was using percpu_ref which wasn't properly initialized. Due to the way percpu variables are laid out on x86, this didn't blow up immediately on x86 but ended up incrementing and decrementing the percpu variable at offset zero, whatever it may be; however, on other archs, this caused a fault and early boot failure. As cgroup self csses for root cgroups of non-dfl hierarchies need working refcounting, we can't revert 9395a4500404. This patch adds CSS_NO_REF which explicitly inhibits reference counting on the css and sets it on all normal (non-self) csses and cgroup_dfl_root self css.
v2: cgrp_dfl_root.self is the offending one. Set the flag on it. Signed-off-by: Tejun Heo Reported-by: Stephen Warren Tested-by: Stephen Warren Fixes: 9395a4500404 ("cgroup: enable refcnting for root csses") --- kernel/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c01e8e8dfad0..0343d7ee6d62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4593,11 +4593,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } @@ -4636,6 +4642,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4684,7 +4692,6 @@ int __init cgroup_init(void) struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; - BUG_ON(percpu_ref_init(&css->refcnt, css_release)); css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); -- cgit v1.2.3 From 5c9d535b893f30266ea29fe377cb9b002fcd76aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove css_parent() cgroup in general is moving towards using cgroup_subsys_state as the fundamental structural component and css_parent() was introduced to convert from using cgroup->parent to css->parent. It was quite some time ago and we're moving forward with making css more prominent. 
This patch drops the trivial wrapper css_parent() and let the users dereference css->parent. While at it, explicitly mark fields of css which are public and immutable. v2: New usage from device_cgroup.c converted. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Johannes Weiner --- kernel/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0343d7ee6d62..929bbbc539e9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3176,10 +3176,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -3261,12 +3261,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; } static bool cgroup_has_live_children(struct cgroup *cgrp) -- cgit v1.2.3 From d51f39b05ce0008118c45945e681b20484990571 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove cgroup->parent cgroup->parent is redundant as cgroup->self.parent can also be used to determine the parent cgroup and we're moving towards using cgroup_subsys_states as the fundamental structural blocks. This patch introduces cgroup_parent() which follows cgroup->self.parent and removes cgroup->parent. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 929bbbc539e9..8c67a739aea4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,6 +218,15 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -260,9 +269,9 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; - while (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ss->id))) - cgrp = cgrp->parent; + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); } @@ -307,7 +316,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -454,7 +463,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (cgrp->populated_kn) kernfs_notify(cgrp->populated_kn); - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -2018,7 +2027,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && dst_cgrp->child_subsys_mask) return -EBUSY; @@ -2427,7 +2436,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); return 0; } @@ -2610,8 +2619,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, /* unavailable or not enabled on the parent? */ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgrp->parent && - !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { ret = -ENOENT; goto out_unlock; } @@ -2640,7 +2649,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * Except for the root, child_subsys_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. 
*/ - if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ret = -EBUSY; goto out_unlock; } @@ -2898,9 +2907,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { @@ -4092,14 +4101,14 @@ static void css_free_work_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - if (cgrp->parent) { + if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ - cgroup_put(cgrp->parent); + cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); kfree(cgrp); } else { @@ -4163,8 +4172,8 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; - if (cgrp->parent) { - css->parent = cgroup_css(cgrp->parent, ss); + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } @@ -4218,7 +4227,7 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; int err; @@ -4251,7 +4260,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_clear_dir; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { + cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) @@ -4309,7 +4318,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; cgrp->self.parent = &parent->self; cgrp->root = root; @@ -4336,7 +4344,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4531,8 +4539,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgrp->parent->flags); - check_for_release(cgrp->parent); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); -- cgit v1.2.3 From d5c419b68e368fdd9f1857bf8d4bb4480edb9b80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: move cgroup->sibling and ->children into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. Let's move cgroup->sibling and ->children into cgroup_subsys_state. This is pure move without functional change and only cgroup->self's fields are actually used. Other csses will make use of the fields later. While at it, update init_and_link_css() so that it zeroes the whole css before initializing it and remove explicit zeroing of ->flags. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c67a739aea4..5385839e727b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -378,7 +378,7 @@ static int notify_on_release(const struct cgroup *cgrp) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->children, sibling) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ @@ -870,7 +870,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); @@ -1432,7 +1432,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1512,8 +1512,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) struct cgroup_subsys *ss; int ssid; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1612,7 +1612,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); 
BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -3128,11 +3128,11 @@ css_next_child(struct cgroup_subsys_state *pos_css, * cgroup is removed or iteration and removal race. */ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) if (next->serial_nr > pos->serial_nr) break; } @@ -3142,12 +3142,12 @@ css_next_child(struct cgroup_subsys_state *pos_css, * the next sibling; however, it might have @ss disabled. If so, * fast-forward to the next enabled one. */ - while (&next->sibling != &cgrp->children) { + while (&next->self.sibling != &cgrp->self.children) { struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); if (next_css) return next_css; - next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); } return NULL; } @@ -3283,7 +3283,7 @@ static bool cgroup_has_live_children(struct cgroup *cgrp) struct cgroup *child; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { + list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { if (!cgroup_is_dead(child)) { rcu_read_unlock(); return true; @@ -4144,7 +4144,7 @@ static void css_release_work_fn(struct work_struct *work) } else { /* cgroup release path */ mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->sibling); + list_del_rcu(&cgrp->self.sibling); mutex_unlock(&cgroup_mutex); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); @@ -4168,9 +4168,11 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { 
cgroup_get(cgrp); + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4344,7 +4346,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgroup_parent(cgrp)->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -4507,9 +4509,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ if (cgroup_has_live_children(cgrp)) return -EBUSY; -- cgit v1.2.3 From 1fed1b2e36ba1aa0257004a97e75bbdb70f216b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: link all cgroup_subsys_states in their sibling lists Currently, while all csses have ->children and ->sibling, only the self csses of cgroups make use of them. This patch makes all other csses to link themselves on the sibling lists too. This will be used to update css iteration. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5385839e727b..dcb06e181ce4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4138,19 +4138,21 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_mutex); + + list_del_rcu(&css->sibling); + if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); } else { /* cgroup release path */ - mutex_lock(&cgroup_mutex); - list_del_rcu(&cgrp->self.sibling); - mutex_unlock(&cgroup_mutex); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; } + mutex_unlock(&cgroup_mutex); + call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4230,12 +4232,13 @@ static void offline_css(struct cgroup_subsys_state *css) static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); @@ -4255,11 +4258,12 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) goto err_free_id; /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_clear_dir; + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && cgroup_parent(parent)) { @@ -4272,7 +4276,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_clear_dir: +err_list_del: + list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 
err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); -- cgit v1.2.3 From 0cb51d71c1fa9234afe4213089844be76ec1765a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: move cgroup->serial_nr into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. All csses including the cgroup->self and actual ones now form trees through css->children and ->sibling which follow the same rules as what cgroup->children and ->sibling followed. This patch moves cgroup->serial_nr which is used to implement css iteration into css. Note that all csses, regardless of their types, allocate their serial numbers from the same monotonically increasing counter. This doesn't affect the ordering needed by css iteration or cause any other material behavior changes. This will be used to update css iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dcb06e181ce4..d5af128ec1ec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,14 +157,13 @@ static int cgroup_root_count; static DEFINE_IDR(cgroup_hierarchy_idr); /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. 
Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do @@ -3133,7 +3132,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->serial_nr > pos->serial_nr) + if (next->self.serial_nr > pos->self.serial_nr) break; } @@ -4168,6 +4167,8 @@ static void css_release(struct percpu_ref *ref) static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + cgroup_get(cgrp); memset(css, 0, sizeof(*css)); @@ -4175,6 +4176,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4348,7 +4350,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); -- cgit v1.2.3 From de3f034182ecbf0efbcef7ab8b253c6c3049a592 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: introduce CSS_RELEASED and reduce css iteration fallback window css iterations allow the caller to drop RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab RCU read lock later and continue iteration. 
This is achieved by using CGRP_DEAD to detect whether the current position's next pointer is safe to dereference and, if not, re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d5af128ec1ec..5544e685f2da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3108,27 +3108,28 @@ css_next_child(struct cgroup_subsys_state *pos_css, cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section.
If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's. While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. 
*/ if (!pos) { next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!cgroup_is_dead(pos))) { + } else if (likely(!(pos->self.flags & CSS_RELEASED))) { next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); } else { list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) @@ -4139,6 +4140,7 @@ static void css_release_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { @@ -4525,10 +4527,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. + * creation by disabling cgroup_lock_live_group(). */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.2.3 From c2931b70a32c705b9bd5762f5044f9eac8a52bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: iterate cgroup_subsys_states directly Currently, css_next_child() is implemented as finding the next child cgroup which has the css enabled, which used to be the only way to do it as only cgroups participated in sibling lists and thus could be iterated. This works as long as what's required during iteration is not missing online csses; however, it turns out that there are use cases where offlined but not yet released csses need to be iterated. This is difficult to implement through cgroup iteration in the unified hierarchy as there may be multiple dying csses for the same subsystem associated with a single cgroup. After the recent changes, the cgroup self and regular csses behave identically in how they're linked and unlinked from the sibling lists including assertion of CSS_RELEASED and css_next_child() can simply switch to iterating csses directly.
This both simplifies the logic and ensures that all visible non-released csses are included in the iteration whether there are multiple dying csses for a subsystem or not. As all other iterators depend on css_next_child() for sibling iteration, this changes behaviors of all css iterators. Add and update explanations on the css states which are included in traversal to all iterators. As css iteration could always contain offlined csses, this shouldn't break any of the current users and new usages which need iteration of all on and offline csses can make use of the new semantics. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- kernel/cgroup.c | 62 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5544e685f2da..097a1fc1e1e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3089,21 +3089,25 @@ static int cgroup_task_count(const struct cgroup *cgrp) /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. 
+ * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); @@ -3128,27 +3132,21 @@ css_next_child(struct cgroup_subsys_state *pos_css, * races against release and the race window is very small. */ if (!pos) { - next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); - } else if (likely(!(pos->self.flags & CSS_RELEASED))) { - next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) - if (next->self.serial_nr > pos->self.serial_nr) + list_for_each_entry_rcu(next, &parent->children, sibling) + if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is - * the next sibling; however, it might have @ss disabled. If so, - * fast-forward to the next enabled one. + * the next sibling. 
*/ - while (&next->self.sibling != &cgrp->self.children) { - struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - - if (next_css) - return next_css; - next = list_entry_rcu(next->self.sibling.next, struct cgroup, self.sibling); - } + if (&next->sibling != &parent->children) + return next; return NULL; } @@ -3165,6 +3163,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3252,6 +3257,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. 
*/ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, -- cgit v1.2.3 From 184faf32328c65c9d86b19577b8d8b90bdd2cd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: use CSS_ONLINE instead of CGRP_DEAD Use CSS_ONLINE on the self css to indicate whether a cgroup has been killed instead of CGRP_DEAD. This will allow re-using the css online test as the cgroup liveness test. This doesn't introduce any functional change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 097a1fc1e1e8..004004fd0ded 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -278,7 +278,7 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) @@ -1518,6 +1518,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4541,13 +4542,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group().
*/ - set_bit(CGRP_DEAD, &cgrp->flags); + cgrp->self.flags &= ~CSS_ONLINE; /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); -- cgit v1.2.3 From f3d4650015301d1c880df4523f7e7ef320a38aab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: convert cgroup_has_live_children() into css_has_online_children() Now that cgroup liveness and css onlineness are the same state, convert cgroup_has_live_children() into css_has_online_children() so that it can be used for actual csses too. The function now uses css_for_each_child() for iteration and is published. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 004004fd0ded..082bb842b11a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,7 +175,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static bool cgroup_has_live_children(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -1769,7 +1768,7 @@ static void cgroup_kill_sb(struct super_block *sb) * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release.
*/ - if (cgroup_has_live_children(&root->cgrp)) + if (css_has_online_children(&root->cgrp.self)) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); @@ -3291,19 +3290,28 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return pos->parent; } -static bool cgroup_has_live_children(struct cgroup *cgrp) +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. + */ +bool css_has_online_children(struct cgroup_subsys_state *css) { - struct cgroup *child; + struct cgroup_subsys_state *child; + bool ret = false; rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->self.children, self.sibling) { - if (!cgroup_is_dead(child)) { - rcu_read_unlock(); - return true; + css_for_each_child(child, css) { + if (css->flags & CSS_ONLINE) { + ret = true; + break; } } rcu_read_unlock(); - return false; + return ret; } /** @@ -4535,7 +4543,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ - if (cgroup_has_live_children(cgrp)) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* @@ -5014,8 +5022,8 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && !cgroup_has_live_children(cgrp)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. 
If it's not * already queued for a userspace notification, queue -- cgit v1.2.3 From 5533e0114425dcdb878f11b291f2727af8667a7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 19:33:07 -0400 Subject: cgroup: disallow debug controller on the default hierarchy The debug controller, as its name suggests, exposes cgroup core internals to userland to aid debugging. Unfortunately, except for the name, there's no provision to prevent its usage in production configurations and the controller is widely enabled and mounted leaking internal details to userland. Like most other debug information, the information exposed by debug isn't interesting even for debugging itself once the related parts are working reliably. This controller has no reason for existing. This patch implements cgrp_dfl_root_inhibit_ss_mask which can suppress specific subsystems on the default hierarchy and adds the debug subsystem to it so that it can be gradually deprecated as usages move towards the unified hierarchy. 
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 082bb842b11a..a5f75ac4e793 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -148,6 +148,13 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -1126,6 +1133,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; + unsigned int tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1143,7 +1151,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) return -EBUSY; } - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -2426,7 +2439,8 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); return 0; } @@ -2564,7 +2578,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (tok[0] == '\0') continue; for_each_subsys(ss, ssid) { - if (ss->disabled || strcmp(tok + 1, ss->name)) + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) 
continue; if (*tok == '+') { -- cgit v1.2.3 From 1f779fb28aa07350d72976d304591d216ca86f0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 4 Jun 2014 16:48:15 +0800 Subject: cgroup: don't destroy the default root The default root is allocated and initialized during the boot phase, so we shouldn't destroy the default root when it's umounted, otherwise it will lead to disaster. Just try to mount and then umount the default root, and the kernel will crash immediately. v2: - No need to check for CSS_NO_REF in cgroup_get/put(). (Tejun) - Better call cgroup_put() for the default root in kill_sb(). (Tejun) - Add a comment. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5f75ac4e793..3f46165829a4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1780,8 +1780,11 @@ static void cgroup_kill_sb(struct super_block *sb) * If @root doesn't have any mounts or children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. + * + * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self)) + if (css_has_online_children(&root->cgrp.self) || + root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); -- cgit v1.2.3 From c731ae1d0f02a300697a8b1564780ad28a6c2013 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 5 Jun 2014 17:16:30 +0800 Subject: cgroup: disallow disabled controllers on the default hierarchy After booting with cgroup_disable=memory, I still saw memcg files in the default hierarchy, and I can write to them, though it won't take effect. # dmesg ... Disabling memory control group subsystem ... # mount -t cgroup -o __DEVEL__sane_behavior xxx /cgroup # ls /cgroup ...
memory.failcnt memory.move_charge_at_immigrate memory.force_empty memory.numa_stat memory.limit_in_bytes memory.oom_control ... # cat /cgroup/memory.usage_in_bytes 0 tj: Minor comment update. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f46165829a4..3edcc8ae83b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3069,6 +3069,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (ss->disabled) + return 0; + if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4678,8 +4681,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) BUG_ON(online_css(css)); - cgrp_dfl_root.subsys_mask |= 1 << ss->id; - mutex_unlock(&cgroup_mutex); } @@ -4758,11 +4759,14 @@ int __init cgroup_init(void) &cgrp_dfl_root.cgrp.e_csets[ssid]); /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. + * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. */ - if (ss->base_cftypes) + if (!ss->disabled) { + cgrp_dfl_root.subsys_mask |= 1 << ss->id; WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.2.3