From 65dff759d2948cf18e2029fc5c0c595b8b7da3a5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 1 Mar 2013 15:01:56 +0800
Subject: cgroup: fix cgroup_path() vs rename() race

rename() will change dentry->d_name. The result of this race can
be worse than seeing partially rewritten name, but we might access
a stale pointer because rename() will re-allocate memory to hold
a longer name.

As accessing dentry->name must be protected by dentry->d_lock or
parent inode's i_mutex, while on the other hand cgroup-path() can
be called with some irq-safe spinlocks held, we can't generate
cgroup path using dentry->d_name.

Alternatively we make a copy of dentry->d_name and save it in
cgrp->name when a cgroup is created, and update cgrp->name at
rename().

v5: use flexible array instead of zero-size array.
v4: - allocate root_cgroup_name and all root_cgroup->name points to it.
    - add cgroup_name() wrapper.
v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path.
v2: make cgrp->name RCU safe.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-cgroup.h     |   2 -
 include/linux/cgroup.h |  24 +++++++++++
 kernel/cgroup.c        | 106 +++++++++++++++++++++++++++++++++++--------------
 3 files changed, 100 insertions(+), 32 deletions(-)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index f2b292925ccd..4e595ee8c915 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 {
 	int ret;
 
-	rcu_read_lock();
 	ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	rcu_read_unlock();
 	if (ret)
 		strncpy(buf, "<unavailable>", buflen);
 	return ret;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 900af5964f55..75c6ec1ba1ba 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -150,6 +150,11 @@ enum {
 	CGRP_CPUSET_CLONE_CHILDREN,
 };
 
+struct cgroup_name {
+	struct rcu_head rcu_head;
+	char name[];
+};
+
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -172,6 +177,19 @@ struct cgroup {
 	struct cgroup *parent;		/* my parent */
 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
+	/*
+	 * This is a copy of dentry->d_name, and it's needed because
+	 * we can't use dentry->d_name in cgroup_path().
+	 *
+	 * You must acquire rcu_read_lock() to access cgrp->name, and
+	 * the only place that can change it is rename(), which is
+	 * protected by parent dir's i_mutex.
+	 *
+	 * Normally you should use cgroup_name() wrapper rather than
+	 * access it directly.
+	 */
+	struct cgroup_name __rcu *name;
+
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
@@ -404,6 +422,12 @@ struct cgroup_scanner {
 	void *data;
 };
 
+/* Caller should hold rcu_read_lock() */
+static inline const char *cgroup_name(const struct cgroup *cgrp)
+{
+	return rcu_dereference(cgrp->name)->name;
+}
+
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..50682168abc2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
 #define dummytop (&rootnode.top_cgroup)
 
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
+{
+	struct cgroup_name *name;
+
+	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+	if (!name)
+		return NULL;
+	strcpy(name->name, dentry->d_name.name);
+	return name;
+}
+
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work)
 	simple_xattrs_free(&cgrp->xattrs);
 
 	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+	kfree(rcu_dereference_raw(cgrp->name));
 	kfree(cgrp);
 }
 
@@ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
+	cgrp->name = &root_cgroup_name;
 	cgrp->top_cgroup = cgrp;
 	init_cgroup_housekeeping(cgrp);
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
@@ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj;
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference.  Writes path of cgroup into buf.  Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf.  Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
-	struct dentry *dentry = cgrp->dentry;
+	int ret = -ENAMETOOLONG;
 	char *start;
 
-	rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
-			   "cgroup_path() called without proper locking");
-
-	if (cgrp == dummytop) {
-		/*
-		 * Inactive subsystems have no dentry for their root
-		 * cgroup
-		 */
-		strcpy(buf, "/");
-		return 0;
-	}
-
 	start = buf + buflen - 1;
-
 	*start = '\0';
-	for (;;) {
-		int len = dentry->d_name.len;
 
+	rcu_read_lock();
+	while (cgrp) {
+		const char *name = cgroup_name(cgrp);
+		int len;
+
+		len = strlen(name);
 		if ((start -= len) < buf)
-			return -ENAMETOOLONG;
-		memcpy(start, dentry->d_name.name, len);
-		cgrp = cgrp->parent;
-		if (!cgrp)
-			break;
+			goto out;
+		memcpy(start, name, len);
 
-		dentry = cgrp->dentry;
 		if (!cgrp->parent)
-			continue;
+			break;
+
 		if (--start < buf)
-			return -ENAMETOOLONG;
+			goto out;
 		*start = '/';
+
+		cgrp = cgrp->parent;
 	}
+	ret = 0;
 	memmove(buf, start, buf + buflen - start);
-	return 0;
+out:
+	rcu_read_unlock();
+	return ret;
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
@@ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 			    struct inode *new_dir, struct dentry *new_dentry)
 {
+	int ret;
+	struct cgroup_name *name, *old_name;
+	struct cgroup *cgrp;
+
+	/*
+	 * It's convinient to use parent dir's i_mutex to protected
+	 * cgrp->name.
+	 */
+	lockdep_assert_held(&old_dir->i_mutex);
+
 	if (!S_ISDIR(old_dentry->d_inode->i_mode))
 		return -ENOTDIR;
 	if (new_dentry->d_inode)
 		return -EEXIST;
 	if (old_dir != new_dir)
 		return -EIO;
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+	cgrp = __d_cgrp(old_dentry);
+
+	name = cgroup_alloc_name(new_dentry);
+	if (!name)
+		return -ENOMEM;
+
+	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (ret) {
+		kfree(name);
+		return ret;
+	}
+
+	old_name = cgrp->name;
+	rcu_assign_pointer(cgrp->name, name);
+
+	kfree_rcu(old_name, rcu_head);
+	return 0;
 }
 
 static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			     umode_t mode)
 {
 	struct cgroup *cgrp;
+	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
 	int err = 0;
 	struct cgroup_subsys *ss;
@@ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (!cgrp)
 		return -ENOMEM;
 
+	name = cgroup_alloc_name(dentry);
+	if (!name)
+		goto err_free_cgrp;
+	rcu_assign_pointer(cgrp->name, name);
+
 	cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
 	if (cgrp->id < 0)
-		goto err_free_cgrp;
+		goto err_free_name;
 
 	/*
 	 * Only live parents can have children.  Note that the liveliness
@@ -4276,6 +4320,8 @@ err_free_all:
 	deactivate_super(sb);
 err_free_id:
 	ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+	kfree(rcu_dereference_raw(cgrp->name));
 err_free_cgrp:
 	kfree(cgrp);
 	return err;
-- 
cgit v1.2.3


From f440d98f8ebab02a768c1de17395e4239af9a97d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 1 Mar 2013 15:02:15 +0800
Subject: cpuset: use cgroup_name() in cpuset_print_task_mems_allowed()

Use cgroup_name() instead of cgrp->dentry->name. This makes the code
a bit simpler.

While at it, remove cpuset_name and make cpuset_nodelist a local variable
to cpuset_print_task_mems_allowed().

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..ace5bfcdcb30 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -264,17 +264,6 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_MUTEX(callback_mutex);
 
-/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers.  They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
- */
-#define CPUSET_NAME_LEN		(128)
-#define	CPUSET_NODELIST_LEN	(256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
-
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -2592,6 +2581,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
 }
 
+#define CPUSET_NODELIST_LEN	(256)
+
 /**
  * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
  * @task: pointer to task_struct of some task.
@@ -2602,24 +2593,19 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  */
 void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
-	struct dentry *dentry;
+	 /* Statically allocated to prevent using excess stack. */
+	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+	static DEFINE_SPINLOCK(cpuset_buffer_lock);
 
-	dentry = task_cs(tsk)->css.cgroup->dentry;
-	spin_lock(&cpuset_buffer_lock);
+	struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
 
-	if (!dentry) {
-		strcpy(cpuset_name, "/");
-	} else {
-		spin_lock(&dentry->d_lock);
-		strlcpy(cpuset_name, (const char *)dentry->d_name.name,
-			CPUSET_NAME_LEN);
-		spin_unlock(&dentry->d_lock);
-	}
+	spin_lock(&cpuset_buffer_lock);
 
 	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
 			   tsk->mems_allowed);
 	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
-	       tsk->comm, cpuset_name, cpuset_nodelist);
+	       tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
+
 	spin_unlock(&cpuset_buffer_lock);
 }
 
-- 
cgit v1.2.3


From f50daa704f36a6544a902c52b6cf37b0493dfc5d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 1 Mar 2013 15:06:07 +0800
Subject: cgroup: no need to check css refs for release notification

We no longer fail rmdir() when there're still css refs, so we don't
need to check css refs in check_for_release().

This also voids a bug. cgroup_has_css_refs() accesses subsys[i]
without cgroup_mutex, so it can race with cgroup_unload_subsys().

cgroup_has_css_refs()
...
  if (ss == NULL || ss->root != cgrp->root)

if ss pointers to net_cls_subsys, and cls_cgroup module is unloaded
right after the former check but before the latter, the memory that
net_cls_subsys resides has become invalid.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 67 +++++++--------------------------------------------------
 1 file changed, 8 insertions(+), 59 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 50682168abc2..9df799d5d31c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4341,47 +4341,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
-/*
- * Check the reference count on each subsystem. Since we already
- * established that there are no tasks in the cgroup, if the css refcount
- * is also 1, then there should be no outstanding references, so the
- * subsystem is safe to destroy. We scan across all subsystems rather than
- * using the per-hierarchy linked list of mounted subsystems since we can
- * be called via check_for_release() with no synchronization other than
- * RCU, and the subsystem linked list isn't RCU-safe.
- */
-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
-	int i;
-
-	/*
-	 * We won't need to lock the subsys array, because the subsystems
-	 * we're concerned about aren't going anywhere since our cgroup root
-	 * has a reference on them.
-	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		struct cgroup_subsys_state *css;
-
-		/* Skip subsystems not present or not in this hierarchy */
-		if (ss == NULL || ss->root != cgrp->root)
-			continue;
-
-		css = cgrp->subsys[ss->subsys_id];
-		/*
-		 * When called from check_for_release() it's possible
-		 * that by this point the cgroup has been removed
-		 * and the css deleted. But a false-positive doesn't
-		 * matter, since it can only happen if the cgroup
-		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway.
-		 */
-		if (css && css_refcnt(css) > 1)
-			return 1;
-	}
-	return 0;
-}
-
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
@@ -5108,12 +5067,15 @@ static void check_for_release(struct cgroup *cgrp)
 {
 	/* All of these checks rely on RCU to keep the cgroup
 	 * structure alive */
-	if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
-	    && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
-		/* Control Group is currently removeable. If it's not
+	if (cgroup_is_releasable(cgrp) &&
+	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+		/*
+		 * Control Group is currently removeable. If it's not
 		 * already queued for a userspace notification, queue
-		 * it now */
+		 * it now
+		 */
 		int need_schedule_work = 0;
+
 		raw_spin_lock(&release_list_lock);
 		if (!cgroup_is_removed(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
@@ -5146,24 +5108,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 /* Caller must verify that the css is not for root cgroup */
 void __css_put(struct cgroup_subsys_state *css)
 {
-	struct cgroup *cgrp = css->cgroup;
 	int v;
 
-	rcu_read_lock();
 	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-
-	switch (v) {
-	case 1:
-		if (notify_on_release(cgrp)) {
-			set_bit(CGRP_RELEASABLE, &cgrp->flags);
-			check_for_release(cgrp);
-		}
-		break;
-	case 0:
+	if (v == 0)
 		schedule_work(&css->dput_work);
-		break;
-	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
-- 
cgit v1.2.3


From 7d8e0bf56a66bab08d2f316dd87e56c08cecb899 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 5 Mar 2013 10:57:03 +0800
Subject: cgroup: avoid accessing modular cgroup subsys structure without
 locking

subsys[i] is set to NULL in cgroup_unload_subsys() at modular unload,
and that's protected by cgroup_mutex, and then the memory *subsys[i]
resides will be freed.

So this is unsafe without any locking:

  if (!ss || ss->module)
  ...

v2:
- add a comment for enum cgroup_subsys_id
- simplify the comment in cgroup_exit()

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 17 ++++++++++++++---
 kernel/cgroup.c        | 28 ++++++++++++++--------------
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 75c6ec1ba1ba..5f76829dd75e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -44,14 +44,25 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss);
 
 extern const struct file_operations proc_cgroup_operations;
 
-/* Define the enumeration of all builtin cgroup subsystems */
+/*
+ * Define the enumeration of all cgroup subsystems.
+ *
+ * We define ids for builtin subsystems and then modular ones.
+ */
 #define SUBSYS(_x) _x ## _subsys_id,
-#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option)
 enum cgroup_subsys_id {
+#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
+#include <linux/cgroup_subsys.h>
+#undef IS_SUBSYS_ENABLED
+	CGROUP_BUILTIN_SUBSYS_COUNT,
+
+	__CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1,
+
+#define IS_SUBSYS_ENABLED(option) IS_MODULE(option)
 #include <linux/cgroup_subsys.h>
+#undef IS_SUBSYS_ENABLED
 	CGROUP_SUBSYS_COUNT,
 };
-#undef IS_SUBSYS_ENABLED
 #undef SUBSYS
 
 /* Per-subsystem/per-cgroup state maintained by the system. */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9df799d5d31c..7a6c4c72ca55 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4940,17 +4940,17 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	if (need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * fork/exit callbacks are supported only for builtin
+		 * subsystems, and the builtin section of the subsys
+		 * array is immutable, so we don't need to lock the
+		 * subsys array here. On the other hand, modular section
+		 * of the array can be freed at module unload, so we
+		 * can't touch that.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 
-			/*
-			 * fork/exit callbacks are supported only for
-			 * builtin subsystems and we don't need further
-			 * synchronization as they never go away.
-			 */
-			if (!ss || ss->module)
-				continue;
-
 			if (ss->fork)
 				ss->fork(child);
 		}
@@ -5015,13 +5015,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	tsk->cgroups = &init_css_set;
 
 	if (run_callbacks && need_forkexit_callback) {
-		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		/*
+		 * fork/exit callbacks are supported only for builtin
+		 * subsystems, see cgroup_post_fork() for details.
+		 */
+		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 
-			/* modular subsystems can't use callbacks */
-			if (!ss || ss->module)
-				continue;
-
 			if (ss->exit) {
 				struct cgroup *old_cgrp =
 					rcu_dereference_raw(cg->subsys[i])->cgroup;
-- 
cgit v1.2.3


From 9259826ccb8165f797e4c2c9d17925b41af5f6ae Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 5 Mar 2013 11:37:50 +0800
Subject: res_counter: remove include of cgroup.h from res_counter.h

It's not needed at all.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/res_counter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 5ae8456d9670..a83a849bf1d3 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -13,7 +13,7 @@
  * info about what this counter is.
  */
 
-#include <linux/cgroup.h>
+#include <linux/spinlock.h>
 
 /*
  * The core object. the cgroup that wishes to account for some
-- 
cgit v1.2.3


From ff794dea52eaaa09017efea688a1d7f92ab0818e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 5 Mar 2013 11:37:56 +0800
Subject: cpuset: remove include of cgroup.h from cpuset.h

We don't need to include cgroup.h in cpuset.h.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8c8a60d29407..ccd1de8ad822 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -11,7 +11,6 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
-#include <linux/cgroup.h>
 #include <linux/mm.h>
 
 #ifdef CONFIG_CPUSETS
-- 
cgit v1.2.3


From cfb5966bef85412dab9c93553db10b3e99ac32e8 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 10:28:39 +0800
Subject: cpuset: fix RCU lockdep splat in cpuset_print_task_mems_allowed()

Sasha reported a lockdep warning when OOM was triggered. The reason
is cgroup_name() should be called with rcu_read_lock() held.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ace5bfcdcb30..efbfca7a33e4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2599,6 +2599,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 
 	struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
 
+	rcu_read_lock();
 	spin_lock(&cpuset_buffer_lock);
 
 	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
@@ -2607,6 +2608,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 	       tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
 
 	spin_unlock(&cpuset_buffer_lock);
+	rcu_read_unlock();
 }
 
 /*
-- 
cgit v1.2.3


From e7b2dcc52b0e2d598a469f01cc460ccdde6869f2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:35:58 -0700
Subject: cgroup: remove cgroup_is_descendant()

It was used by ns cgroup, and ns cgroup was removed long ago.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  3 ---
 kernel/cgroup.c        | 28 ----------------------------
 2 files changed, 31 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5f76829dd75e..7e818a3ef60a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -448,9 +448,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
-/* Return true if cgrp is a descendant of the task's cgroup */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
-
 /*
  * Control Group taskset, used to pass around set of tasks to cgroup_subsys
  * methods.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7a6c4c72ca55..f51443fd5f71 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5035,34 +5035,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 	put_css_set_taskexit(cg);
 }
 
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
-	int ret;
-	struct cgroup *target;
-
-	if (cgrp == dummytop)
-		return 1;
-
-	target = task_cgroup_from_root(task, cgrp->root);
-	while (cgrp != target && cgrp!= cgrp->top_cgroup)
-		cgrp = cgrp->parent;
-	ret = (cgrp == target);
-	return ret;
-}
-
 static void check_for_release(struct cgroup *cgrp)
 {
 	/* All of these checks rely on RCU to keep the cgroup
-- 
cgit v1.2.3


From 6dc01181eac16192dc4a5d1b310b78e2e97c003c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:35:58 -0700
Subject: cgroup: remove unused variables in cgroup_destroy_locked()

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f51443fd5f71..fd0b056d8da5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4346,10 +4346,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 {
 	struct dentry *d = cgrp->dentry;
 	struct cgroup *parent = cgrp->parent;
-	DEFINE_WAIT(wait);
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
-	LIST_HEAD(tmp_list);
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
-- 
cgit v1.2.3


From d7eeac1913ff86a17f891cb4b73f03d4b94907d0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:35:59 -0700
Subject: cgroup: hold cgroup_mutex before calling css_offline()

cpuset no longer nests cgroup_mutex inside cpu_hotplug lock, so
we don't have to release cgroup_mutex before calling css_offline().

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroups/cgroups.txt |  1 +
 kernel/cgroup.c                   | 11 +----------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index bcf1a00b06a1..0028e888828c 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -580,6 +580,7 @@ propagation along the hierarchy. See the comment on
 cgroup_for_each_descendant_pre() for details.
 
 void css_offline(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
 
 This is the counterpart of css_online() and called iff css_online()
 has succeeded on @cgrp. This signifies the beginning of the end of
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fd0b056d8da5..49297cbc134d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4169,17 +4169,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (!(css->flags & CSS_ONLINE))
 		return;
 
-	/*
-	 * css_offline() should be called with cgroup_mutex unlocked.  See
-	 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
-	 * details.  This temporary unlocking should go away once
-	 * cgroup_mutex is unexported from controllers.
-	 */
-	if (ss->css_offline) {
-		mutex_unlock(&cgroup_mutex);
+	if (ss->css_offline)
 		ss->css_offline(cgrp);
-		mutex_lock(&cgroup_mutex);
-	}
 
 	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
-- 
cgit v1.2.3


From 6ee211ad0a22869af81eef10845922ac4dcb2d38 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:36:00 -0700
Subject: cgroup: don't bother to resize pid array

When we open cgroup.procs, we'll allocate an buffer and store all tasks'
tgid in it, and then duplicate entries will be stripped. If that results
in a much smaller pid list, we'll re-allocate a smaller buffer.

But we've already sucessfully allocated memory and reading the procs
file is a short period and the memory will be freed very soon, so why
bother to re-allocate memory.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 37 +++----------------------------------
 1 file changed, 3 insertions(+), 34 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 49297cbc134d..29c77a714731 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3400,35 +3400,14 @@ static void pidlist_free(void *p)
 	else
 		kfree(p);
 }
-static void *pidlist_resize(void *p, int newcount)
-{
-	void *newlist;
-	/* note: if new alloc fails, old p will still be valid either way */
-	if (is_vmalloc_addr(p)) {
-		newlist = vmalloc(newcount * sizeof(pid_t));
-		if (!newlist)
-			return NULL;
-		memcpy(newlist, p, newcount * sizeof(pid_t));
-		vfree(p);
-	} else {
-		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
-	}
-	return newlist;
-}
 
 /*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
  */
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
 {
 	int src, dest = 1;
-	pid_t *list = *p;
-	pid_t *newlist;
 
 	/*
 	 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3449,16 +3428,6 @@ static int pidlist_uniq(pid_t **p, int length)
 		dest++;
 	}
 after:
-	/*
-	 * if the length difference is large enough, we want to allocate a
-	 * smaller buffer to save memory. if this fails due to out of memory,
-	 * we'll just stay with what we've got.
-	 */
-	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
-		newlist = pidlist_resize(list, dest);
-		if (newlist)
-			*p = newlist;
-	}
 	return dest;
 }
 
@@ -3554,7 +3523,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	/* now sort & (if procs) strip out duplicates */
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
-		length = pidlist_uniq(&array, length);
+		length = pidlist_uniq(array, length);
 	l = cgroup_pidlist_find(cgrp, type);
 	if (!l) {
 		pidlist_free(array);
-- 
cgit v1.2.3


From 80f36c2a1a612ca419e5b864a7e4808e797d9feb Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:36:00 -0700
Subject: cgroup: remove useless code in cgroup_write_event_control()

eventfd_poll() never returns POLLHUP.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29c77a714731..c7fe303b5109 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3937,12 +3937,6 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	if (ret)
 		goto fail;
 
-	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
-		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
-		ret = 0;
-		goto fail;
-	}
-
 	/*
 	 * Events should be removed after rmdir of cgroup directory, but before
 	 * destroying subsystem state objects. Let's take reference to cgroup
-- 
cgit v1.2.3


From 3ac1707a13a3da9cfc8f242a15b2fae6df2c5f88 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 12 Mar 2013 15:36:00 -0700
Subject: cgroup: fix an off-by-one bug which may trigger BUG_ON()

The 3rd parameter of flex_array_prealloc() is the number of elements,
not the index of the last element.

The effect of the bug is, when opening cgroup.procs, a flex array will
be allocated and all elements of the array is allocated with
GFP_KERNEL flag, but the last one is GFP_ATOMIC, and if we fail to
allocate memory for it, it'll trigger a BUG_ON().

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c7fe303b5109..54689fc008f6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2076,7 +2076,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	if (!group)
 		return -ENOMEM;
 	/* pre-allocate to guarantee space while iterating in rcu read-side. */
-	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
 	if (retval)
 		goto out_free_group_list;
 
-- 
cgit v1.2.3


From 26898fdff371d78f122cf15d8732d1d37f2d1338 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Fri, 15 Feb 2013 11:55:44 -0500
Subject: devcg: expand may_access() logic

In order to make the next patch more clear, expand may_access() logic.

v2: may_access() returns bool now

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 security/device_cgroup.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 1c69e38e3a2c..4ad55a9c6920 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -305,8 +305,8 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
  * @dev_cgroup: dev cgroup to be tested against
  * @refex: new exception
  */
-static int may_access(struct dev_cgroup *dev_cgroup,
-		      struct dev_exception_item *refex)
+static bool may_access(struct dev_cgroup *dev_cgroup,
+		       struct dev_exception_item *refex)
 {
 	struct dev_exception_item *ex;
 	bool match = false;
@@ -332,16 +332,19 @@ static int may_access(struct dev_cgroup *dev_cgroup,
 
 	/*
 	 * In two cases we'll consider this new exception valid:
-	 * - the dev cgroup has its default policy to allow + exception list:
-	 *   the new exception should *not* match any of the exceptions
-	 *   (behavior == DEVCG_DEFAULT_ALLOW, !match)
 	 * - the dev cgroup has its default policy to deny + exception list:
 	 *   the new exception *should* match the exceptions
-	 *   (behavior == DEVCG_DEFAULT_DENY, match)
+	 * - the dev cgroup has its default policy to allow + exception list:
+	 *   the new exception should *not* match any of the exceptions
 	 */
-	if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match)
-		return 1;
-	return 0;
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_DENY) {
+		if (match)
+			return true;
+	} else {
+		if (!match)
+			return true;
+	}
+	return false;
 }
 
 /*
-- 
cgit v1.2.3


From c39a2a3018f8065cb5ea38b0314c1bbedb2cfa0d Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Fri, 15 Feb 2013 11:55:45 -0500
Subject: devcg: prepare may_access() for hierarchy support

Currently may_access() is only able to verify if an exception is valid for the
current cgroup, which has the same behavior. With hierarchy, it'll be also used
to verify if a cgroup local exception is valid towards its cgroup parent, which
might have different behavior.

v2:
- updated patch description
- rebased on top of a new patch to expand the may_access() logic to make it
  more clear
- fixed argument description order in may_access()

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 security/device_cgroup.c | 49 ++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 4ad55a9c6920..ddf3c709c0c2 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -25,6 +25,12 @@
 
 static DEFINE_MUTEX(devcgroup_mutex);
 
+enum devcg_behavior {
+	DEVCG_DEFAULT_NONE,
+	DEVCG_DEFAULT_ALLOW,
+	DEVCG_DEFAULT_DENY,
+};
+
 /*
  * exception list locking rules:
  * hold devcgroup_mutex for update/read.
@@ -42,10 +48,7 @@ struct dev_exception_item {
 struct dev_cgroup {
 	struct cgroup_subsys_state css;
 	struct list_head exceptions;
-	enum {
-		DEVCG_DEFAULT_ALLOW,
-		DEVCG_DEFAULT_DENY,
-	} behavior;
+	enum devcg_behavior behavior;
 };
 
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -304,9 +307,11 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
  *		verify if a certain access is allowed.
  * @dev_cgroup: dev cgroup to be tested against
  * @refex: new exception
+ * @behavior: behavior of the exception
  */
 static bool may_access(struct dev_cgroup *dev_cgroup,
-		       struct dev_exception_item *refex)
+		       struct dev_exception_item *refex,
+		       enum devcg_behavior behavior)
 {
 	struct dev_exception_item *ex;
 	bool match = false;
@@ -330,19 +335,27 @@ static bool may_access(struct dev_cgroup *dev_cgroup,
 		break;
 	}
 
-	/*
-	 * In two cases we'll consider this new exception valid:
-	 * - the dev cgroup has its default policy to deny + exception list:
-	 *   the new exception *should* match the exceptions
-	 * - the dev cgroup has its default policy to allow + exception list:
-	 *   the new exception should *not* match any of the exceptions
-	 */
-	if (dev_cgroup->behavior == DEVCG_DEFAULT_DENY) {
-		if (match)
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
+		if (behavior == DEVCG_DEFAULT_ALLOW) {
+			/* the exception will deny access to certain devices */
+			return true;
+		} else {
+			/* the exception will allow access to certain devices */
+			if (match)
+				/*
+				 * a new exception allowing access shouldn't
+				 * match an parent's exception
+				 */
+				return false;
 			return true;
+		}
 	} else {
-		if (!match)
+		/* only behavior == DEVCG_DEFAULT_DENY allowed here */
+		if (match)
+			/* parent has an exception that matches the proposed */
 			return true;
+		else
+			return false;
 	}
 	return false;
 }
@@ -361,7 +374,7 @@ static int parent_has_perm(struct dev_cgroup *childcg,
 	if (!pcg)
 		return 1;
 	parent = cgroup_to_devcgroup(pcg);
-	return may_access(parent, ex);
+	return may_access(parent, ex, childcg->behavior);
 }
 
 /**
@@ -395,7 +408,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 {
 	const char *b;
 	char temp[12];		/* 11 + 1 characters needed for a u32 */
-	int count, rc;
+	int count, rc = 0;
 	struct dev_exception_item ex;
 	struct cgroup *p = devcgroup->css.cgroup;
 	struct dev_cgroup *parent = NULL;
@@ -612,7 +625,7 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 
 	rcu_read_lock();
 	dev_cgroup = task_devcgroup(current);
-	rc = may_access(dev_cgroup, &ex);
+	rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior);
 	rcu_read_unlock();
 
 	if (!rc)
-- 
cgit v1.2.3


From 1909554c9715e4d032497993bb56f2726bfa89ae Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Fri, 15 Feb 2013 11:55:46 -0500
Subject: devcg: use css_online and css_offline

Allocate resources and change behavior only when online. This is needed in
order to determine if a node is suitable for hierarchy propagation or if it's
being removed.

Locking:
Both functions take devcgroup_mutex to make changes to device_cgroup structure.
Hierarchy propagation will also take devcgroup_mutex before walking the
tree while walking the tree itself is protected by rcu lock.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 security/device_cgroup.c | 59 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index ddf3c709c0c2..16c9e1069be6 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -185,36 +185,59 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
 	__dev_exception_clean(dev_cgroup);
 }
 
+/**
+ * devcgroup_online - initializes devcgroup's behavior and exceptions based on
+ * 		      parent's
+ * @cgroup: cgroup getting online
+ * returns 0 in case of success, error code otherwise
+ */
+static int devcgroup_online(struct cgroup *cgroup)
+{
+	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL;
+	int ret = 0;
+
+	mutex_lock(&devcgroup_mutex);
+	dev_cgroup = cgroup_to_devcgroup(cgroup);
+	if (cgroup->parent)
+		parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent);
+
+	if (parent_dev_cgroup == NULL)
+		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
+	else {
+		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
+					  &parent_dev_cgroup->exceptions);
+		if (!ret)
+			dev_cgroup->behavior = parent_dev_cgroup->behavior;
+	}
+	mutex_unlock(&devcgroup_mutex);
+
+	return ret;
+}
+
+static void devcgroup_offline(struct cgroup *cgroup)
+{
+	struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup);
+
+	mutex_lock(&devcgroup_mutex);
+	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
+	mutex_unlock(&devcgroup_mutex);
+}
+
 /*
  * called from kernel/cgroup.c with cgroup_lock() held.
  */
 static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
 {
-	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
+	struct dev_cgroup *dev_cgroup;
 	struct cgroup *parent_cgroup;
-	int ret;
 
 	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
 	if (!dev_cgroup)
 		return ERR_PTR(-ENOMEM);
 	INIT_LIST_HEAD(&dev_cgroup->exceptions);
+	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
 	parent_cgroup = cgroup->parent;
 
-	if (parent_cgroup == NULL)
-		dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
-	else {
-		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
-		mutex_lock(&devcgroup_mutex);
-		ret = dev_exceptions_copy(&dev_cgroup->exceptions,
-					  &parent_dev_cgroup->exceptions);
-		dev_cgroup->behavior = parent_dev_cgroup->behavior;
-		mutex_unlock(&devcgroup_mutex);
-		if (ret) {
-			kfree(dev_cgroup);
-			return ERR_PTR(ret);
-		}
-	}
-
 	return &dev_cgroup->css;
 }
 
@@ -587,6 +610,8 @@ struct cgroup_subsys devices_subsys = {
 	.can_attach = devcgroup_can_attach,
 	.css_alloc = devcgroup_css_alloc,
 	.css_free = devcgroup_css_free,
+	.css_online = devcgroup_online,
+	.css_offline = devcgroup_offline,
 	.subsys_id = devices_subsys_id,
 	.base_cftypes = dev_cgroup_files,
 
-- 
cgit v1.2.3


From bd2953ebbb533aeda9b86c82a53d5197a9a38f1b Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Fri, 15 Feb 2013 11:55:47 -0500
Subject: devcg: propagate local changes down the hierarchy

This patch makes exception changes to propagate down in hierarchy respecting
when possible local exceptions.

New exceptions allowing additional access to devices won't be propagated, but
it'll be possible to add an exception to access all of part of the newly
allowed device(s).

New exceptions disallowing access to devices will be propagated down and the
local group's exceptions will be revalidated for the new situation.
Example:
      A
     / \
        B

    group        behavior          exceptions
    A            allow             "b 8:* rwm", "c 116:1 rw"
    B            deny              "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"

If a new exception is added to group A:
	# echo "c 116:* r" > A/devices.deny
it'll propagate down and after revalidating B's local exceptions, the exception
"c 116:2 rwm" will be removed.

In case parent's exceptions change and local exceptions are not allowed anymore,
they'll be deleted.

v7:
- do not allow behavior change when the cgroup has children
- update documentation

v6: fixed issues pointed by Serge Hallyn
- only copy parent's exceptions while propagating behavior if the local
  behavior is different
- while propagating exceptions, do not clear and copy parent's: it'd be against
  the premise we don't propagate access to more devices

v5: fixed issues pointed by Serge Hallyn
- updated documentation
- not propagating when an exception is written to devices.allow
- when propagating a new behavior, clean the local exceptions list if they're
  for a different behavior

v4: fixed issues pointed by Tejun Heo
- separated function to walk the tree and collect valid propagation targets

v3: fixed issues pointed by Tejun Heo
- update documentation
- move css_online/css_offline changes to a new patch
- use cgroup_for_each_descendant_pre() instead of own descendant walk
- move exception_copy rework to a separared patch
- move exception_clean rework to a separated patch

v2: fixed issues pointed by Tejun Heo
- instead of keeping the local settings that won't apply anymore, remove them

Cc: Tejun Heo <tj@kernel.org>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroups/devices.txt |  70 ++++++++++++++++++-
 security/device_cgroup.c          | 139 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 199 insertions(+), 10 deletions(-)

diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt
index 16624a7f8222..3c1095ca02ea 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroups/devices.txt
@@ -13,9 +13,7 @@ either an integer or * for all.  Access is a composition of r
 The root device cgroup starts with rwm to 'all'.  A child device
 cgroup gets a copy of the parent.  Administrators can then remove
 devices from the whitelist or add new entries.  A child cgroup can
-never receive a device access which is denied by its parent.  However
-when a device access is removed from a parent it will not also be
-removed from the child(ren).
+never receive a device access which is denied by its parent.
 
 2. User Interface
 
@@ -50,3 +48,69 @@ task to a new cgroup.  (Again we'll probably want to change that).
 
 A cgroup may not be granted more permissions than the cgroup's
 parent has.
+
+4. Hierarchy
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent.  Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated.  In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example:
+      A
+     / \
+        B
+
+    group        behavior	exceptions
+    A            allow		"b 8:* rwm", "c 116:1 rw"
+    B            deny		"c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A:
+	# echo "c 116:* r" > A/devices.deny
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed:
+
+    group        whitelist entries                        denied devices
+    A            all                                      "b 8:* rwm", "c 116:* rw"
+    B            "c 1:3 rwm", "b 3:* rwm"                 all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated:
+      A
+     / \
+        B
+
+    group        whitelist entries                        denied devices
+    A            "c 1:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+when adding "c *:3 rwm":
+	# echo "c *:3 rwm" >A/devices.allow
+
+the result:
+    group        whitelist entries                        denied devices
+    A            "c *:3 rwm", "c 1:5 r"                   all the rest
+    B            "c 1:3 rwm", "c 1:5 r"                   all the rest
+
+but now it'll be possible to add new entries to B:
+	# echo "c 2:3 rwm" >B/devices.allow
+	# echo "c 50:3 r" >B/devices.allow
+or even
+	# echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroups has children.
+
+4.1 Hierarchy (internal implementation)
+
+device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions.  The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation.  Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 16c9e1069be6..221967d4690c 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -49,6 +49,8 @@ struct dev_cgroup {
 	struct cgroup_subsys_state css;
 	struct list_head exceptions;
 	enum devcg_behavior behavior;
+	/* temporary list for pending propagation operations */
+	struct list_head propagate_pending;
 };
 
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -185,6 +187,11 @@ static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
 	__dev_exception_clean(dev_cgroup);
 }
 
+static inline bool is_devcg_online(const struct dev_cgroup *devcg)
+{
+	return (devcg->behavior != DEVCG_DEFAULT_NONE);
+}
+
 /**
  * devcgroup_online - initializes devcgroup's behavior and exceptions based on
  * 		      parent's
@@ -235,6 +242,7 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
 	if (!dev_cgroup)
 		return ERR_PTR(-ENOMEM);
 	INIT_LIST_HEAD(&dev_cgroup->exceptions);
+	INIT_LIST_HEAD(&dev_cgroup->propagate_pending);
 	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
 	parent_cgroup = cgroup->parent;
 
@@ -413,6 +421,111 @@ static inline int may_allow_all(struct dev_cgroup *parent)
 	return parent->behavior == DEVCG_DEFAULT_ALLOW;
 }
 
+/**
+ * revalidate_active_exceptions - walks through the active exception list and
+ * 				  revalidates the exceptions based on parent's
+ * 				  behavior and exceptions. The exceptions that
+ * 				  are no longer valid will be removed.
+ * 				  Called with devcgroup_mutex held.
+ * @devcg: cgroup which exceptions will be checked
+ *
+ * This is one of the three key functions for hierarchy implementation.
+ * This function is responsible for re-evaluating all the cgroup's active
+ * exceptions due to a parent's exception change.
+ * Refer to Documentation/cgroups/devices.txt for more details.
+ */
+static void revalidate_active_exceptions(struct dev_cgroup *devcg)
+{
+	struct dev_exception_item *ex;
+	struct list_head *this, *tmp;
+
+	list_for_each_safe(this, tmp, &devcg->exceptions) {
+		ex = container_of(this, struct dev_exception_item, list);
+		if (!parent_has_perm(devcg, ex))
+			dev_exception_rm(devcg, ex);
+	}
+}
+
+/**
+ * get_online_devcg - walks the cgroup tree and fills a list with the online
+ * 		      groups
+ * @root: cgroup used as starting point
+ * @online: list that will be filled with online groups
+ *
+ * Must be called with devcgroup_mutex held. Grabs RCU lock.
+ * Because devcgroup_mutex is held, no devcg will become online or offline
+ * during the tree walk (see devcgroup_online, devcgroup_offline)
+ * A separated list is needed because propagate_behavior() and
+ * propagate_exception() need to allocate memory and can block.
+ */
+static void get_online_devcg(struct cgroup *root, struct list_head *online)
+{
+	struct cgroup *pos;
+	struct dev_cgroup *devcg;
+
+	lockdep_assert_held(&devcgroup_mutex);
+
+	rcu_read_lock();
+	cgroup_for_each_descendant_pre(pos, root) {
+		devcg = cgroup_to_devcgroup(pos);
+		if (is_devcg_online(devcg))
+			list_add_tail(&devcg->propagate_pending, online);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * propagate_exception - propagates a new exception to the children
+ * @devcg_root: device cgroup that added a new exception
+ * @ex: new exception to be propagated
+ *
+ * returns: 0 in case of success, != 0 in case of error
+ */
+static int propagate_exception(struct dev_cgroup *devcg_root,
+			       struct dev_exception_item *ex)
+{
+	struct cgroup *root = devcg_root->css.cgroup;
+	struct dev_cgroup *devcg, *parent, *tmp;
+	int rc = 0;
+	LIST_HEAD(pending);
+
+	get_online_devcg(root, &pending);
+
+	list_for_each_entry_safe(devcg, tmp, &pending, propagate_pending) {
+		parent = cgroup_to_devcgroup(devcg->css.cgroup->parent);
+
+		/*
+		 * in case both root's behavior and devcg is allow, a new
+		 * restriction means adding to the exception list
+		 */
+		if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW &&
+		    devcg->behavior == DEVCG_DEFAULT_ALLOW) {
+			rc = dev_exception_add(devcg, ex);
+			if (rc)
+				break;
+		} else {
+			/*
+			 * in the other possible cases:
+			 * root's behavior: allow, devcg's: deny
+			 * root's behavior: deny, devcg's: deny
+			 * the exception will be removed
+			 */
+			dev_exception_rm(devcg, ex);
+		}
+		revalidate_active_exceptions(devcg);
+
+		list_del_init(&devcg->propagate_pending);
+	}
+	return rc;
+}
+
+static inline bool has_children(struct dev_cgroup *devcgroup)
+{
+	struct cgroup *cgrp = devcgroup->css.cgroup;
+
+	return !list_empty(&cgrp->children);
+}
+
 /*
  * Modify the exception list using allow/deny rules.
  * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
@@ -449,6 +562,9 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	case 'a':
 		switch (filetype) {
 		case DEVCG_ALLOW:
+			if (has_children(devcgroup))
+				return -EINVAL;
+
 			if (!may_allow_all(parent))
 				return -EPERM;
 			dev_exception_clean(devcgroup);
@@ -462,6 +578,9 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 				return rc;
 			break;
 		case DEVCG_DENY:
+			if (has_children(devcgroup))
+				return -EINVAL;
+
 			dev_exception_clean(devcgroup);
 			devcgroup->behavior = DEVCG_DEFAULT_DENY;
 			break;
@@ -556,22 +675,28 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 			dev_exception_rm(devcgroup, &ex);
 			return 0;
 		}
-		return dev_exception_add(devcgroup, &ex);
+		rc = dev_exception_add(devcgroup, &ex);
+		break;
 	case DEVCG_DENY:
 		/*
 		 * If the default policy is to deny by default, try to remove
 		 * an matching exception instead. And be silent about it: we
 		 * don't want to break compatibility
 		 */
-		if (devcgroup->behavior == DEVCG_DEFAULT_DENY) {
+		if (devcgroup->behavior == DEVCG_DEFAULT_DENY)
 			dev_exception_rm(devcgroup, &ex);
-			return 0;
-		}
-		return dev_exception_add(devcgroup, &ex);
+		else
+			rc = dev_exception_add(devcgroup, &ex);
+
+		if (rc)
+			break;
+		/* we only propagate new restrictions */
+		rc = propagate_exception(devcgroup, &ex);
+		break;
 	default:
-		return -EINVAL;
+		rc = -EINVAL;
 	}
-	return 0;
+	return rc;
 }
 
 static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft,
-- 
cgit v1.2.3


From 081aa458c38ba576bdd4265fc807fa95b48b9e79 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Wed, 13 Mar 2013 09:17:09 +0800
Subject: cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc()

These two functions share most of the code.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |   3 +-
 kernel/cgroup.c        | 109 +++++++++----------------------------------------
 kernel/cpuset.c        |   2 +-
 3 files changed, 23 insertions(+), 91 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7e818a3ef60a..01c48c6806d6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -693,7 +693,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
-int cgroup_attach_task(struct cgroup *, struct task_struct *);
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+		       bool threadgroup);
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 54689fc008f6..04fa2abf94b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -59,7 +59,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
 
 #include <linux/atomic.h>
@@ -1943,82 +1943,6 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	put_css_set(oldcg);
 }
 
-/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
- *
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
- */
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
-	int retval = 0;
-	struct cgroup_subsys *ss, *failed_ss = NULL;
-	struct cgroup *oldcgrp;
-	struct cgroupfs_root *root = cgrp->root;
-	struct cgroup_taskset tset = { };
-	struct css_set *newcg;
-
-	/* @tsk either already exited or can't exit until the end */
-	if (tsk->flags & PF_EXITING)
-		return -ESRCH;
-
-	/* Nothing to do if the task is already in that cgroup */
-	oldcgrp = task_cgroup_from_root(tsk, root);
-	if (cgrp == oldcgrp)
-		return 0;
-
-	tset.single.task = tsk;
-	tset.single.cgrp = oldcgrp;
-
-	for_each_subsys(root, ss) {
-		if (ss->can_attach) {
-			retval = ss->can_attach(cgrp, &tset);
-			if (retval) {
-				/*
-				 * Remember on which subsystem the can_attach()
-				 * failed, so that we only call cancel_attach()
-				 * against the subsystems whose can_attach()
-				 * succeeded. (See below)
-				 */
-				failed_ss = ss;
-				goto out;
-			}
-		}
-	}
-
-	newcg = find_css_set(tsk->cgroups, cgrp);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
-
-	for_each_subsys(root, ss) {
-		if (ss->attach)
-			ss->attach(cgrp, &tset);
-	}
-
-out:
-	if (retval) {
-		for_each_subsys(root, ss) {
-			if (ss == failed_ss)
-				/*
-				 * This subsystem was the one that failed the
-				 * can_attach() check earlier, so we don't need
-				 * to call cancel_attach() against it or any
-				 * remaining subsystems.
-				 */
-				break;
-			if (ss->cancel_attach)
-				ss->cancel_attach(cgrp, &tset);
-		}
-	}
-	return retval;
-}
-
 /**
  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  * @from: attach to all cgroups of a given task
@@ -2033,7 +1957,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	for_each_active_root(root) {
 		struct cgroup *from_cg = task_cgroup_from_root(from, root);
 
-		retval = cgroup_attach_task(from_cg, tsk);
+		retval = cgroup_attach_task(from_cg, tsk, false);
 		if (retval)
 			break;
 	}
@@ -2044,21 +1968,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * @tsk: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
  *
  * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * task_lock of @tsk or each thread in the threadgroup individually in turn.
  */
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+		       bool threadgroup)
 {
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
-	/* guaranteed to be initialized later, but the compiler needs this */
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
-	struct task_struct *tsk;
+	struct task_struct *leader = tsk;
 	struct task_and_cgroup *tc;
 	struct flex_array *group;
 	struct cgroup_taskset tset = { };
@@ -2070,7 +1995,10 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * group - group_rwsem prevents new threads from appearing, and if
 	 * threads exit, this will just be an over-estimate.
 	 */
-	group_size = get_nr_threads(leader);
+	if (threadgroup)
+		group_size = get_nr_threads(tsk);
+	else
+		group_size = 1;
 	/* flex_array supports very large thread-groups better than kmalloc. */
 	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
 	if (!group)
@@ -2080,7 +2008,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	if (retval)
 		goto out_free_group_list;
 
-	tsk = leader;
 	i = 0;
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2109,6 +2036,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
+
+		if (!threadgroup)
+			break;
 	} while_each_thread(leader, tsk);
 	rcu_read_unlock();
 	/* remember the number of threads in the array for later. */
@@ -2262,9 +2192,10 @@ retry_find_task:
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
-		ret = cgroup_attach_proc(cgrp, tsk);
-	} else
-		ret = cgroup_attach_task(cgrp, tsk);
+	}
+
+	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
 	threadgroup_unlock(tsk);
 
 	put_task_struct(tsk);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index efbfca7a33e4..98d458aad789 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2008,7 +2008,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 	struct cgroup *new_cgroup = scan->data;
 
 	cgroup_lock();
-	cgroup_attach_task(new_cgroup, tsk);
+	cgroup_attach_task(new_cgroup, tsk, false);
 	cgroup_unlock();
 }
 
-- 
cgit v1.2.3


From 1ae65ae92d77542f81c68269bc2f15bcf1ee4d6a Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Wed, 3 Apr 2013 20:32:04 +0300
Subject: cgroups: Documentation/cgroup/cgroup.txt - a trivial fix.

This trivial patch removes a word which appears twice in
Documentation/cgroup/cgroup.txt.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroups/cgroups.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 0028e888828c..638bf17ff869 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -442,7 +442,7 @@ You can attach the current shell task by echoing 0:
 You can use the cgroup.procs file instead of the tasks file to move all
 threads in a threadgroup at once. Echoing the PID of any task in a
 threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
 in the writing task's threadgroup.
 
 Note: Since every task is always a member of exactly one cgroup in each
-- 
cgit v1.2.3


From 1e2ccd1c0f67c3f958d6139de2496787b9a57182 Mon Sep 17 00:00:00 2001
From: Kevin Wilson <wkevils@gmail.com>
Date: Mon, 1 Apr 2013 10:51:37 +0300
Subject: cgroup: remove unused parameter in cgroup_task_migrate().

This patch removes unused parameter from cgroup_task_migrate().

Signed-off-by: Kevin Wilson <wkevils@gmail.com>
Acked-by: Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04fa2abf94b2..4aee5bdd66c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1911,7 +1911,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  *
  * Must be called with cgroup_mutex and threadgroup locked.
  */
-static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+static void cgroup_task_migrate(struct cgroup *oldcgrp,
 				struct task_struct *tsk, struct css_set *newcg)
 {
 	struct css_set *oldcg;
@@ -2084,7 +2084,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
-- 
cgit v1.2.3


From d9c10ddddc98db0a316243cd266c466875975a94 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 28 Mar 2013 08:48:14 +0100
Subject: memcg: fix memcg_cache_name() to use cgroup_name()

As cgroup supports rename, it's unsafe to dereference dentry->d_name
without proper vfs locks. Fix this by using cgroup_name() rather than
dentry directly.

Also open code memcg_cache_name because it is called only from
kmem_cache_dup which frees the returned name right after
kmem_cache_create_memcg makes a copy of it. Such a short-lived
allocation doesn't make too much sense. So replace it by a static
buffer as kmem_cache_dup is called with memcg_cache_mutex.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Glauber Costa <glommer@parallels.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/memcontrol.c | 63 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53b8201b31eb..9715c0c491b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3214,52 +3214,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
 	schedule_work(&cachep->memcg_params->destroy);
 }
 
-static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
-{
-	char *name;
-	struct dentry *dentry;
-
-	rcu_read_lock();
-	dentry = rcu_dereference(memcg->css.cgroup->dentry);
-	rcu_read_unlock();
-
-	BUG_ON(dentry == NULL);
-
-	name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
-			 memcg_cache_id(memcg), dentry->d_name.name);
-
-	return name;
-}
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allow them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache from a non
+ * will span more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
 
+/*
+ * Called with memcg_cache_mutex held
+ */
 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
 					 struct kmem_cache *s)
 {
-	char *name;
 	struct kmem_cache *new;
+	static char *tmp_name = NULL;
 
-	name = memcg_cache_name(memcg, s);
-	if (!name)
-		return NULL;
+	lockdep_assert_held(&memcg_cache_mutex);
+
+	/*
+	 * kmem_cache_create_memcg duplicates the given name and
+	 * cgroup_name for this name requires RCU context.
+	 * This static temporary buffer is used to prevent from
+	 * pointless shortliving allocation.
+	 */
+	if (!tmp_name) {
+		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!tmp_name)
+			return NULL;
+	}
+
+	rcu_read_lock();
+	snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
+			 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
+	rcu_read_unlock();
 
-	new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
 				      (s->flags & ~SLAB_PANIC), s->ctor, s);
 
 	if (new)
 		new->allocflags |= __GFP_KMEMCG;
 
-	kfree(name);
 	return new;
 }
 
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 						  struct kmem_cache *cachep)
 {
-- 
cgit v1.2.3


From 8cc9934520e7f752fe45d5199664d741ba24a932 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 09:29:50 -0700
Subject: cgroup, cpuset: replace move_member_tasks_to_cpuset() with
 cgroup_transfer_tasks()

When a cpuset becomes empty (no CPU or memory), its tasks are
transferred with the nearest ancestor with execution resources.  This
is implemented using cgroup_scan_tasks() with a callback which grabs
cgroup_mutex and invokes cgroup_attach_task() on each task.

Both cgroup_mutex and cgroup_attach_task() are scheduled to be
unexported.  Implement cgroup_transfer_tasks() in cgroup proper which
is essentially the same as move_member_tasks_to_cpuset() except that
it takes cgroups instead of cpusets and @to comes before @from like
normal functions with those arguments, and replace
move_member_tasks_to_cpuset() with it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 28 +++++++++++++++++++++++++++
 kernel/cpuset.c        | 51 ++++++--------------------------------------------
 3 files changed, 35 insertions(+), 45 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 01c48c6806d6..f8eb01d75ddc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -696,6 +696,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 		       bool threadgroup);
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4aee5bdd66c8..147d7ccb3b0b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3269,6 +3269,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
 	return 0;
 }
 
+static void cgroup_transfer_one_task(struct task_struct *task,
+				     struct cgroup_scanner *scan)
+{
+	struct cgroup *new_cgroup = scan->data;
+
+	cgroup_lock();
+	cgroup_attach_task(new_cgroup, task, false);
+	cgroup_unlock();
+}
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+	struct cgroup_scanner scan;
+
+	scan.cg = from;
+	scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.process_task = cgroup_transfer_one_task;
+	scan.heap = NULL;
+	scan.data = to;
+
+	return cgroup_scan_tasks(&scan);
+}
+
 /*
  * Stuff for reading the 'tasks'/'procs' files.
  *
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 98d458aad789..866d78ee7930 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1994,50 +1994,6 @@ int __init cpuset_init(void)
 	return 0;
 }
 
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
- */
-static void cpuset_do_move_task(struct task_struct *tsk,
-				struct cgroup_scanner *scan)
-{
-	struct cgroup *new_cgroup = scan->data;
-
-	cgroup_lock();
-	cgroup_attach_task(new_cgroup, tsk, false);
-	cgroup_unlock();
-}
-
-/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cpuset_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- *
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- */
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
-{
-	struct cgroup_scanner scan;
-
-	scan.cg = from->css.cgroup;
-	scan.test_task = NULL; /* select all tasks in cgroup */
-	scan.process_task = cpuset_do_move_task;
-	scan.heap = NULL;
-	scan.data = to->css.cgroup;
-
-	if (cgroup_scan_tasks(&scan))
-		printk(KERN_ERR "move_member_tasks_to_cpuset: "
-				"cgroup_scan_tasks failed\n");
-}
-
 /*
  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2058,7 +2014,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 			nodes_empty(parent->mems_allowed))
 		parent = parent_cs(parent);
 
-	move_member_tasks_to_cpuset(cs, parent);
+	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+		rcu_read_lock();
+		printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
+		       cgroup_name(cs->css.cgroup));
+		rcu_read_unlock();
+	}
 }
 
 /**
-- 
cgit v1.2.3


From 7ae1bad99e27b8838d480a24edf4646a2fc547df Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 09:29:51 -0700
Subject: cgroup: relocate cgroup_lock_live_group() and
 cgroup_attach_task_all()

cgroup_lock_live_group() and cgroup_attach_task() are scheduled to be
made static.  Relocate the former and cgroup_attach_task_all() so that
we don't need forward declarations.

This patch is pure relocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 84 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 147d7ccb3b0b..ae7617095dec 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -329,6 +329,24 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 	return __d_cfe(dentry)->type;
 }
 
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
+ */
+bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_mutex);
+	if (cgroup_is_removed(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		return false;
+	}
+	return true;
+}
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -1943,30 +1961,6 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
 	put_css_set(oldcg);
 }
 
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
-	struct cgroupfs_root *root;
-	int retval = 0;
-
-	cgroup_lock();
-	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
-
-		retval = cgroup_attach_task(from_cg, tsk, false);
-		if (retval)
-			break;
-	}
-	cgroup_unlock();
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
 /**
  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
@@ -2204,6 +2198,30 @@ out_unlock_cgroup:
 	return ret;
 }
 
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+		retval = cgroup_attach_task(from_cg, tsk, false);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
 	return attach_task_by_pid(cgrp, pid, false);
@@ -2214,24 +2232,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 	return attach_task_by_pid(cgrp, tgid, true);
 }
 
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
-	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
-		mutex_unlock(&cgroup_mutex);
-		return false;
-	}
-	return true;
-}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-- 
cgit v1.2.3


From b9777cf8d7c7854c3c38bd6621d993b85c2afcdf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 09:29:51 -0700
Subject: cgroup: unexport locking interface and cgroup_attach_task()

Now that all external cgroup_lock() users are gone, we can finally
unexport the locking interface and prevent future abuse of
cgroup_mutex.

Make cgroup_[un]lock() and cgroup_lock_live_group() static.  Also,
cgroup_attach_task() doesn't have any user left and can't be used
without locking interface anyway.  Make it static too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 5 -----
 kernel/cgroup.c        | 9 +++------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8eb01d75ddc..63deb70f3149 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -30,10 +30,7 @@ struct css_id;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
-extern void cgroup_lock(void);
 extern int cgroup_lock_is_held(void);
-extern bool cgroup_lock_live_group(struct cgroup *cgrp);
-extern void cgroup_unlock(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
@@ -693,8 +690,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 					struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
-		       bool threadgroup);
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae7617095dec..32ca0304452f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -336,7 +336,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
  * On success, returns true; the lock should be later released with
  * cgroup_unlock(). On failure returns false with no lock held.
  */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
+static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
 	if (cgroup_is_removed(cgrp)) {
@@ -345,7 +345,6 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
 	}
 	return true;
 }
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
@@ -824,22 +823,20 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * cgroup_lock - lock out any changes to cgroup structures
  *
  */
-void cgroup_lock(void)
+static void cgroup_lock(void)
 {
 	mutex_lock(&cgroup_mutex);
 }
-EXPORT_SYMBOL_GPL(cgroup_lock);
 
 /**
  * cgroup_unlock - release lock on cgroup changes
  *
  * Undo the lock taken in a previous cgroup_lock() call.
  */
-void cgroup_unlock(void)
+static void cgroup_unlock(void)
 {
 	mutex_unlock(&cgroup_mutex);
 }
-EXPORT_SYMBOL_GPL(cgroup_unlock);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
-- 
cgit v1.2.3


From 47cfcd0922454e49f4923b1e2d31a5bf199237c3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 09:29:51 -0700
Subject: cgroup: kill cgroup_[un]lock()

Now that locking interface is unexported, there's no reason to keep
around these thin wrappers.  Kill them and use mutex operations
directly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 32ca0304452f..1a65958c1a0b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -333,8 +333,8 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
  *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
+ * On success, returns true; the mutex should be later unlocked.  On
+ * failure returns false with no lock held.
  */
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
@@ -819,25 +819,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-static void cgroup_lock(void)
-{
-	mutex_lock(&cgroup_mutex);
-}
-
-/**
- * cgroup_unlock - release lock on cgroup changes
- *
- * Undo the lock taken in a previous cgroup_lock() call.
- */
-static void cgroup_unlock(void)
-{
-	mutex_unlock(&cgroup_mutex);
-}
-
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
  * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -1967,8 +1948,8 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
  * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
  * task_lock of @tsk or each thread in the threadgroup individually in turn.
  */
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
-		       bool threadgroup)
+static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+			      bool threadgroup)
 {
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
@@ -2191,7 +2172,7 @@ retry_find_task:
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 
@@ -2205,7 +2186,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 	struct cgroupfs_root *root;
 	int retval = 0;
 
-	cgroup_lock();
+	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		struct cgroup *from_cg = task_cgroup_from_root(from, root);
 
@@ -2213,7 +2194,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (retval)
 			break;
 	}
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 
 	return retval;
 }
@@ -2240,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
@@ -2251,7 +2232,7 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
 	seq_putc(seq, '\n');
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
@@ -3271,9 +3252,9 @@ static void cgroup_transfer_one_task(struct task_struct *task,
 {
 	struct cgroup *new_cgroup = scan->data;
 
-	cgroup_lock();
+	mutex_lock(&cgroup_mutex);
 	cgroup_attach_task(new_cgroup, task, false);
-	cgroup_unlock();
+	mutex_unlock(&cgroup_mutex);
 }
 
 /**
-- 
cgit v1.2.3


From 2219449a65ace0290cd9c2260ff337e326b8be8a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 09:29:51 -0700
Subject: cgroup: remove cgroup_lock_is_held()

We don't want controllers to assume that the information is officially
available and do funky things with it.

The only user is task_subsys_state_check() which uses it to verify RCU
access context.  We can move cgroup_lock_is_held() inside
CONFIG_PROVE_RCU but that doesn't add meaningful protection compared
to conditionally exposing cgroup_mutex.

Remove cgroup_lock_is_held(), export cgroup_mutex iff CONFIG_PROVE_RCU
and use lockdep_is_held() directly on the mutex in
task_subsys_state_check().

While at it, add parentheses around macro arguments in
task_subsys_state_check().

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 13 +++++++++----
 kernel/cgroup.c        | 20 ++++++--------------
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 63deb70f3149..515927eebb37 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -30,7 +30,6 @@ struct css_id;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
-extern int cgroup_lock_is_held(void);
 extern void cgroup_fork(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
@@ -552,10 +551,16 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
  * rcu_dereference_check() conditions, such as locks used during the
  * cgroup_subsys::attach() methods.
  */
+#ifdef CONFIG_PROVE_RCU
+extern struct mutex cgroup_mutex;
 #define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference_check(task->cgroups->subsys[subsys_id],		\
-			      lockdep_is_held(&task->alloc_lock) ||	\
-			      cgroup_lock_is_held() || (__c))
+	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
+			      lockdep_is_held(&(task)->alloc_lock) ||	\
+			      lockdep_is_held(&cgroup_mutex) || (__c))
+#else
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#endif
 
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a65958c1a0b..ba3e24a76dae 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -83,7 +83,13 @@
  * B happens only through cgroup_show_options() and using cgroup_root_mutex
  * breaks it.
  */
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */
+#else
 static DEFINE_MUTEX(cgroup_mutex);
+#endif
+
 static DEFINE_MUTEX(cgroup_root_mutex);
 
 /*
@@ -251,20 +257,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
-{
-	return lockdep_is_held(&cgroup_mutex);
-}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
-{
-	return mutex_is_locked(&cgroup_mutex);
-}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
-
 static int css_unbias_refcnt(int refcnt)
 {
 	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-- 
cgit v1.2.3


From 8adf12b0ffd0a10340bab355e586f8533a69630d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 7 Apr 2013 10:25:39 -0700
Subject: devcg: remove broken_hierarchy tag

bd2953ebbb ("devcg: propagate local changes down the hierarchy")
implemented proper hierarchy support.  Remove the broken tag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Aristeu Rozanski <aris@redhat.com>
---
 security/device_cgroup.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 221967d4690c..994aeccea518 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -739,15 +739,6 @@ struct cgroup_subsys devices_subsys = {
 	.css_offline = devcgroup_offline,
 	.subsys_id = devices_subsys_id,
 	.base_cftypes = dev_cgroup_files,
-
-	/*
-	 * While devices cgroup has the rudimentary hierarchy support which
-	 * checks the parent's restriction, it doesn't properly propagates
-	 * config changes in ancestors to their descendents.  A child
-	 * should only be allowed to add more restrictions to the parent's
-	 * configuration.  Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 
 /**
-- 
cgit v1.2.3


From 84cfb6ab484b442d5115eb3baf9db7d74a3ea626 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Wed, 10 Apr 2013 14:41:17 +0300
Subject: cgroup: remove bind() method from cgroup_subsys.

The bind() method of cgroup_subsys is not used in any of the
controllers (cpuset, freezer, blkio, net_cls, memcg, net_prio,
devices, perf, hugetlb, cpu and cpuacct)

tj: Removed the entry on ->bind() from
    Documentation/cgroups/cgroups.txt.  Also updated a couple
    paragraphs which were suggesting that dynamic re-binding may be
    implemented.  It's not gonna.

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroups/cgroups.txt | 20 +++++---------------
 include/linux/cgroup.h            |  2 --
 kernel/cgroup.c                   |  4 ----
 3 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 638bf17ff869..2b51e12ce178 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -211,10 +211,9 @@ matches, and any of the requested subsystems are in use in an existing
 hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
 is activated, associated with the requested subsystems.
 
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
+It's not possible to bind a new subsystem to an active cgroup
+hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy.
 
 When a cgroup filesystem is unmounted, if there are any
 child cgroups created below the top-level cgroup, that hierarchy
@@ -382,10 +381,8 @@ To Specify a hierarchy's release_agent:
 
 Note that specifying 'release_agent' more than once will return failure.
 
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
+Note that changing the set of subsystems is only supported when the
+hierarchy consists of a single (root) cgroup.
 
 Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
 tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
@@ -643,13 +640,6 @@ void exit(struct task_struct *task)
 
 Called during task exit.
 
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
 
 4. Extended attribute usage
 ===========================
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 515927eebb37..92acf8601ae0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -483,8 +483,6 @@ struct cgroup_subsys {
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		     struct task_struct *task);
-	void (*bind)(struct cgroup *root);
-
 	int subsys_id;
 	int active;
 	int disabled;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ba3e24a76dae..fd38e1cfacca 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1064,16 +1064,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
-			if (ss->bind)
-				ss->bind(cgrp);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
-			if (ss->bind)
-				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
-- 
cgit v1.2.3


From 415cf07a1c1c65249773330434878ae7bcd92d0f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Apr 2013 14:35:02 +0800
Subject: cgroup: make sure parent won't be destroyed before its children

Suppose we rmdir a cgroup and there're still css refs, this cgroup won't
be freed. Then we rmdir the parent cgroup, and the parent is freed
immediately due to css ref draining to 0. Now it would be a disaster if
the still-alive child cgroup tries to access its parent.

Make sure this won't happen.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fd38e1cfacca..65b72d0367c5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -875,6 +875,13 @@ static void cgroup_free_fn(struct work_struct *work)
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);
 
+	/*
+	 * We get a ref to the parent's dentry, and put the ref when
+	 * this cgroup is being freed, so it's guaranteed that the
+	 * parent won't be destroyed before its children.
+	 */
+	dput(cgrp->parent->dentry);
+
 	/*
 	 * Drop the active superblock reference that we took when we
 	 * created the cgroup
@@ -4164,6 +4171,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	for_each_subsys(root, ss)
 		dget(dentry);
 
+	/* hold a ref to the parent's dentry */
+	dget(parent->dentry);
+
 	/* creation succeeded, notify subsystems */
 	for_each_subsys(root, ss) {
 		err = online_css(ss, cgrp);
-- 
cgit v1.2.3


From 78574cf981cd3d9ae9f6adbd466a772310ec24ff Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 8 Apr 2013 19:00:38 -0700
Subject: cgroup: implement cgroup_is_descendant()

A couple controllers want to determine whether two cgroups are in
ancestor/descendant relationship.  As it's more likely that the
descendant is the primary subject of interest and there are other
operations focusing on the descendants, let's ask is_descendent rather
than is_ancestor.

Implementation is trivial as the previous patch guarantees that all
ancestors of a cgroup stay accessible as long as the cgroup is
accessible.

tj: Removed depth optimization, renamed from cgroup_is_ancestor(),
    rewrote descriptions.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 92acf8601ae0..a2b9d4b13369 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -439,6 +439,7 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 65b72d0367c5..7bf3ce09c50c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -276,6 +276,26 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
 	return test_bit(CGRP_REMOVED, &cgrp->flags);
 }
 
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
+ * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+	while (cgrp) {
+		if (cgrp == ancestor)
+			return true;
+		cgrp = cgrp->parent;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(cgroup_is_descendant);
+
 /* bits in struct cgroupfs_root flags field */
 enum {
 	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */
-- 
cgit v1.2.3


From ef824fa129b7579f56b92d466ecda2e378879806 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 8 Apr 2013 19:00:38 -0700
Subject: perf: make perf_event cgroup hierarchical

perf_event is one of a couple remaining cgroup controllers with broken
hierarchy support.  Converting it to support hierarchy is almost
trivial.  The only thing necessary is to consider a task belonging to
a descendant cgroup as a match.  IOW, if the cgroup of the currently
executing task (@cpuctx->cgrp) equals or is a descendant of the
event's cgroup (@event->cgrp), then the event should be enabled.

Implement hierarchy support and remove .broken_hierarchy tag along
with the incorrect comment on what needs to be done for hierarchy
support.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Stephane Eranian <eranian@google.com>
Cc: Namhyung Kim <namhyung.kim@lge.com>
---
 kernel/events/core.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0cd86501c30..310ec19d968a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -251,7 +251,22 @@ perf_cgroup_match(struct perf_event *event)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
-	return !event->cgrp || event->cgrp == cpuctx->cgrp;
+	/* @event doesn't care about cgroup */
+	if (!event->cgrp)
+		return true;
+
+	/* wants specific cgroup scope but @cpuctx isn't associated with any */
+	if (!cpuctx->cgrp)
+		return false;
+
+	/*
+	 * Cgroup scoping is recursive.  An event enabled for a cgroup is
+	 * also enabled for all its descendant cgroups.  If @cpuctx's
+	 * cgroup is a descendant of @event's (the test covers identity
+	 * case), it's a match.
+	 */
+	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+				    event->cgrp->css.cgroup);
 }
 
 static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -7509,12 +7524,5 @@ struct cgroup_subsys perf_subsys = {
 	.css_free	= perf_cgroup_css_free,
 	.exit		= perf_cgroup_exit,
 	.attach		= perf_cgroup_attach,
-
-	/*
-	 * perf_event cgroup doesn't handle nesting correctly.
-	 * ctx->nr_cgroups adjustments should be propagated through the
-	 * cgroup hierarchy.  Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
-- 
cgit v1.2.3


From 26d5bbe5ba2073fc7ef9e69a55543b2376f5bad0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 12 Apr 2013 10:29:04 -0700
Subject: Revert "cgroup: remove bind() method from cgroup_subsys."

This reverts commit 84cfb6ab484b442d5115eb3baf9db7d74a3ea626.  There
are scheduled changes which make use of the removed callback.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rami Rosen <ramirose@gmail.com>
Cc: Li Zefan <lizefan@huawei.com>
---
 Documentation/cgroups/cgroups.txt | 20 +++++++++++++++-----
 include/linux/cgroup.h            |  2 ++
 kernel/cgroup.c                   |  4 ++++
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 2b51e12ce178..638bf17ff869 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -211,9 +211,10 @@ matches, and any of the requested subsystems are in use in an existing
 hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
 is activated, associated with the requested subsystems.
 
-It's not possible to bind a new subsystem to an active cgroup
-hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy.
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
 
 When a cgroup filesystem is unmounted, if there are any
 child cgroups created below the top-level cgroup, that hierarchy
@@ -381,8 +382,10 @@ To Specify a hierarchy's release_agent:
 
 Note that specifying 'release_agent' more than once will return failure.
 
-Note that changing the set of subsystems is only supported when the
-hierarchy consists of a single (root) cgroup.
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
 
 Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
 tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
@@ -640,6 +643,13 @@ void exit(struct task_struct *task)
 
 Called during task exit.
 
+void bind(struct cgroup *root)
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
 
 4. Extended attribute usage
 ===========================
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a2b9d4b13369..45aee0fc6b98 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -484,6 +484,8 @@ struct cgroup_subsys {
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		     struct task_struct *task);
+	void (*bind)(struct cgroup *root);
+
 	int subsys_id;
 	int active;
 	int disabled;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7bf3ce09c50c..678a22c75fdb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1091,12 +1091,16 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
+			if (ss->bind)
+				ss->bind(cgrp);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			if (ss->bind)
+				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
-- 
cgit v1.2.3


From da1f296fd2bfd5ad3c53d72a1ece593e821cf374 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 10:32:19 -0700
Subject: cgroup: make cgroup_path() not print double slashes

While reimplementing cgroup_path(), 65dff759d2 ("cgroup: fix
cgroup_path() vs rename() race") introduced a bug where the path of a
non-root cgroup would have two slahses at the beginning, which is
caused by treating the root cgroup which has the name '/' like
non-root cgroups.

 $ grep systemd /proc/self/cgroup
 1:name=systemd://user/root/1

Fix it by special casing root cgroup case and not looping over it in
the normal path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 678a22c75fdb..faf55f59646b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1811,11 +1811,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	int ret = -ENAMETOOLONG;
 	char *start;
 
+	if (!cgrp->parent) {
+		if (strlcpy(buf, "/", buflen) >= buflen)
+			return -ENAMETOOLONG;
+		return 0;
+	}
+
 	start = buf + buflen - 1;
 	*start = '\0';
 
 	rcu_read_lock();
-	while (cgrp) {
+	do {
 		const char *name = cgroup_name(cgrp);
 		int len;
 
@@ -1824,15 +1830,12 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 			goto out;
 		memcpy(start, name, len);
 
-		if (!cgrp->parent)
-			break;
-
 		if (--start < buf)
 			goto out;
 		*start = '/';
 
 		cgrp = cgrp->parent;
-	}
+	} while (cgrp->parent);
 	ret = 0;
 	memmove(buf, start, buf + buflen - start);
 out:
-- 
cgit v1.2.3


From 9343862945fdd3dd5cb7648bb24cabe40faa9ad9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 20:15:25 -0700
Subject: cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix

There's no reason to be using bitops, which tends to be more
cumbersome, to handle root flags.  Convert them to masks.  Also, as
they'll be moved to include/linux/cgroup.h and it's generally a good
idea, add CGRP_ prefix.

Note that flags are assigned from (1 << 1).  The first bit will be
used by a flag which will be added soon.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index faf55f59646b..67428d84e38b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -296,10 +296,10 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 }
 EXPORT_SYMBOL_GPL(cgroup_is_descendant);
 
-/* bits in struct cgroupfs_root flags field */
+/* cgroupfs_root->flags */
 enum {
-	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */
-	ROOT_XATTR,	/* supports extended attributes */
+	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
+	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
 };
 
 static int cgroup_is_releasable(const struct cgroup *cgrp)
@@ -1137,9 +1137,9 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
-	if (test_bit(ROOT_NOPREFIX, &root->flags))
+	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
-	if (test_bit(ROOT_XATTR, &root->flags))
+	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1202,7 +1202,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "noprefix")) {
-			set_bit(ROOT_NOPREFIX, &opts->flags);
+			opts->flags |= CGRP_ROOT_NOPREFIX;
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
@@ -1210,7 +1210,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
-			set_bit(ROOT_XATTR, &opts->flags);
+			opts->flags |= CGRP_ROOT_XATTR;
 			continue;
 		}
 		if (!strncmp(token, "release_agent=", 14)) {
@@ -1293,8 +1293,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 * with the old cpuset, so we allow noprefix only if mounting just
 	 * the cpuset subsystem.
 	 */
-	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
-	    (opts->subsys_mask & mask))
+	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
 		return -EINVAL;
 
 
@@ -2526,7 +2525,7 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
 static inline int xattr_enabled(struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-	return test_bit(ROOT_XATTR, &root->flags);
+	return root->flags & CGRP_ROOT_XATTR;
 }
 
 static bool is_valid_xattr(const char *name)
@@ -2698,7 +2697,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	simple_xattrs_init(&cft->xattrs);
 
-	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
 	}
-- 
cgit v1.2.3


From 25a7e6848db76e22677aff202d9c4ef3503be15b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 20:15:25 -0700
Subject: move cgroupfs_root to include/linux/cgroup.h

While controllers shouldn't be accessing cgroupfs_root directly, it
being hidden inside kern/cgroup.c makes somethings pretty silly.  This
makes routing hierarchy-wide settings which need to be visible to
controllers cumbersome.

We're gonna add another hierarchy-wide setting which needs to be
accessed from controllers.  Move cgroupfs_root and its flags to the
header file so that we can access root settings with inline helpers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c        | 57 --------------------------------------------------
 2 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 45aee0fc6b98..b21881e1ea08 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
+#include <linux/fs.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -238,6 +239,62 @@ struct cgroup {
 	struct simple_xattrs xattrs;
 };
 
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
+/* cgroupfs_root->flags */
+enum {
+	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
+	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+};
+
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy, and may be
+ * associated with a superblock to form an active hierarchy.  This is
+ * internal to cgroup core.  Don't access directly from controllers.
+ */
+struct cgroupfs_root {
+	struct super_block *sb;
+
+	/*
+	 * The bitmask of subsystems intended to be attached to this
+	 * hierarchy
+	 */
+	unsigned long subsys_mask;
+
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
+	/* The bitmask of subsystems currently attached to this hierarchy */
+	unsigned long actual_subsys_mask;
+
+	/* A list running through the attached subsystems */
+	struct list_head subsys_list;
+
+	/* The root cgroup for this hierarchy */
+	struct cgroup top_cgroup;
+
+	/* Tracks how many cgroups are currently defined in hierarchy.*/
+	int number_of_cgroups;
+
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
+	/* Hierarchy-specific flags */
+	unsigned long flags;
+
+	/* IDs for cgroups in this hierarchy */
+	struct ida cgroup_ida;
+
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
 /*
  * A css_set is a structure holding pointers to a set of
  * cgroup_subsys_state objects. This saves space in the task struct
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 67428d84e38b..8b8eb7c168ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
 #include <linux/cred.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
-#include <linux/fs.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
@@ -104,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
- */
-struct cgroupfs_root {
-	struct super_block *sb;
-
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
-	unsigned long subsys_mask;
-
-	/* Unique id for this hierarchy. */
-	int hierarchy_id;
-
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
-	/* A list running through the attached subsystems */
-	struct list_head subsys_list;
-
-	/* The root cgroup for this hierarchy */
-	struct cgroup top_cgroup;
-
-	/* Tracks how many cgroups are currently defined in hierarchy.*/
-	int number_of_cgroups;
-
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
-	/* Hierarchy-specific flags */
-	unsigned long flags;
-
-	/* IDs for cgroups in this hierarchy */
-	struct ida cgroup_ida;
-
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
-	/* The name for this hierarchy - may be empty */
-	char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -296,12 +245,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 }
 EXPORT_SYMBOL_GPL(cgroup_is_descendant);
 
-/* cgroupfs_root->flags */
-enum {
-	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
-	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-};
-
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int bits =
-- 
cgit v1.2.3


From 873fe09ea5df6ccf6bb34811d8c9992aacb67598 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 20:15:26 -0700
Subject: cgroup: introduce sane_behavior mount option

It's a sad fact that at this point various cgroup controllers are
carrying so many idiosyncrasies and pure insanities that it simply
isn't possible to reach any sort of sane consistent behavior while
maintaining staying fully compatible with what already has been
exposed to userland.

As we can't break exposed userland interface, transitioning to sane
behaviors can only be done in steps while maintaining backwards
compatibility.  This patch introduces a new mount option -
__DEVEL__sane_behavior - which disables crazy features and enforces
consistent behaviors in cgroup core proper and various controllers.
As exactly which behaviors it changes are still being determined, the
mount option, at this point, is useful only for development of the new
behaviors.  As such, the mount option is prefixed with __DEVEL__ and
generates a warning message when used.

Eventually, once we get to the point where all controller's behaviors
are consistent enough to implement unified hierarchy, the __DEVEL__
prefix will be dropped, and more importantly, unified-hierarchy will
enforce sane_behavior by default.  Maybe we'll able to completely drop
the crazy stuff after a while, maybe not, but we at least have a
strategy to move on to saner behaviors.

This patch introduces the mount option and changes the following
behaviors in cgroup core.

* Mount options "noprefix" and "clone_children" are disallowed.  Also,
  cgroupfs file cgroup.clone_children is not created.

* When mounting an existing superblock, mount options should match.
  This is currently pretty crazy.  If one mounts a cgroup, creates a
  subdirectory, unmounts it and then mount it again with different
  option, it looks like the new options are applied but they aren't.

* Remount is disallowed.

The behaviors changes are documented in the comment above
CGRP_ROOT_SANE_BEHAVIOR enum and will be expanded as different
controllers are converted and planned improvements progress.

v2: Dropped unnecessary explicit file permission setting sane_behavior
    cftype entry as suggested by Li Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 include/linux/cgroup.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b21881e1ea08..9c300ad9a911 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -156,6 +156,8 @@ enum {
 	 * specified at mount time and thus is implemented here.
 	 */
 	CGRP_CPUSET_CLONE_CHILDREN,
+	/* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
+	CGRP_SANE_BEHAVIOR,
 };
 
 struct cgroup_name {
@@ -243,6 +245,37 @@ struct cgroup {
 
 /* cgroupfs_root->flags */
 enum {
+	/*
+	 * Unfortunately, cgroup core and various controllers are riddled
+	 * with idiosyncrasies and pointless options.  The following flag,
+	 * when set, will force sane behavior - some options are forced on,
+	 * others are disallowed, and some controllers will change their
+	 * hierarchical or other behaviors.
+	 *
+	 * The set of behaviors affected by this flag are still being
+	 * determined and developed and the mount option for this flag is
+	 * prefixed with __DEVEL__.  The prefix will be dropped once we
+	 * reach the point where all behaviors are compatible with the
+	 * planned unified hierarchy, which will automatically turn on this
+	 * flag.
+	 *
+	 * The followings are the behaviors currently affected this flag.
+	 *
+	 * - Mount options "noprefix" and "clone_children" are disallowed.
+	 *   Also, cgroupfs file cgroup.clone_children is not created.
+	 *
+	 * - When mounting an existing superblock, mount options should
+	 *   match.
+	 *
+	 * - Remount is disallowed.
+	 *
+	 * The followings are planned changes.
+	 *
+	 * - release_agent will be disallowed once replacement notification
+	 *   mechanism is implemented.
+	 */
+	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
+
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
 };
@@ -360,6 +393,7 @@ struct cgroup_map_cb {
 /* cftype->flags */
 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
 #define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
+#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
 
 #define MAX_CFTYPE_NAME		64
 
@@ -486,6 +520,15 @@ struct cgroup_scanner {
 	void *data;
 };
 
+/*
+ * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
+ * function can be called as long as @cgrp is accessible.
+ */
+static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
+{
+	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
+}
+
 /* Caller should hold rcu_read_lock() */
 static inline const char *cgroup_name(const struct cgroup *cgrp)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b8eb7c168ff..67804590d4b0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1080,6 +1080,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+		seq_puts(seq, ",sane_behavior");
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1144,6 +1146,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 			all_ss = true;
 			continue;
 		}
+		if (!strcmp(token, "__DEVEL__sane_behavior")) {
+			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+			continue;
+		}
 		if (!strcmp(token, "noprefix")) {
 			opts->flags |= CGRP_ROOT_NOPREFIX;
 			continue;
@@ -1231,6 +1237,20 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 
 	/* Consistency checks */
 
+	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+		if (opts->flags & CGRP_ROOT_NOPREFIX) {
+			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+			return -EINVAL;
+		}
+
+		if (opts->cpuset_clone_children) {
+			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+			return -EINVAL;
+		}
+	}
+
 	/*
 	 * Option noprefix was introduced just for backward compatibility
 	 * with the old cpuset, so we allow noprefix only if mounting just
@@ -1307,6 +1327,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup_sb_opts opts;
 	unsigned long added_mask, removed_mask;
 
+	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+		pr_err("cgroup: sane_behavior: remount is not allowed\n");
+		return -EINVAL;
+	}
+
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);
@@ -1657,6 +1682,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+
+		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
+		    root->flags != opts.flags) {
+			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+			ret = -EINVAL;
+			goto drop_new_super;
+		}
+
 		/* no subsys rebinding, so refcounts don't change */
 		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
@@ -2200,6 +2233,13 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *seq)
+{
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+	return 0;
+}
+
 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64
 
@@ -2681,6 +2721,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+			continue;
 		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 			continue;
 		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3918,9 +3960,15 @@ static struct cftype files[] = {
 	},
 	{
 		.name = "cgroup.clone_children",
+		.flags = CFTYPE_INSANE,
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
+	{
+		.name = "cgroup.sane_behavior",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_sane_behavior_show,
+	},
 	{
 		.name = "release_agent",
 		.flags = CFTYPE_ONLY_ON_ROOT,
-- 
cgit v1.2.3


From 05fb22ec5456a472a5eadcaacb3e51eca1f8c79c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 15 Apr 2013 14:25:05 +0800
Subject: cgroup: remove cgrp->top_cgroup

It's not used, and it can be retrieved via cgrp->root->top_cgroup.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c        | 2 --
 2 files changed, 3 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9c300ad9a911..64047ae7fde1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -204,7 +204,6 @@ struct cgroup {
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
 	struct cgroupfs_root *root;
-	struct cgroup *top_cgroup;
 
 	/*
 	 * List of cg_cgroup_links pointing at css_sets with
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 67804590d4b0..c16719e056ab 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1418,7 +1418,6 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->name = &root_cgroup_name;
-	cgrp->top_cgroup = cgrp;
 	init_cgroup_housekeeping(cgrp);
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
@@ -4145,7 +4144,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
-	cgrp->top_cgroup = parent->top_cgroup;
 
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-- 
cgit v1.2.3


From f00baae7ad6c5f1503528efa852f0be8e9513f0e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 15 Apr 2013 13:41:15 -0700
Subject: memcg: force use_hierarchy if sane_behavior

Turn on use_hierarchy by default if sane_behavior is specified and
don't create .use_hierarchy file.

It is debatable whether to remove .use_hierarchy file or make it ro as
the former could make transition easier in certain cases; however, the
behavior changes which will be gated by sane_behavior are intensive
including changing basic meaning of certain control knobs in a few
controllers and I don't really think keeping this piece would make
things easier in any noticeable way, so let's remove it.

v2: Explain that mem_cgroup_bind() doesn't have to worry about
    children as suggested by Michal Hocko.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h |  3 +++
 mm/memcontrol.c        | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 64047ae7fde1..cda7eb2239e1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -268,6 +268,9 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
+	 * - memcg: use_hierarchy is on by default and the cgroup file for
+	 *   the flag is not created.
+	 *
 	 * The followings are planned changes.
 	 *
 	 * - release_agent will be disallowed once replacement notification
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9715c0c491b0..df87bdd4d692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5814,6 +5814,7 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "use_hierarchy",
+		.flags = CFTYPE_INSANE,
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
@@ -6784,6 +6785,21 @@ static void mem_cgroup_move_task(struct cgroup *cont,
 }
 #endif
 
+/*
+ * Cgroup retains root cgroups across [un]mount cycles making it necessary
+ * to verify sane_behavior flag on each mount attempt.
+ */
+static void mem_cgroup_bind(struct cgroup *root)
+{
+	/*
+	 * use_hierarchy is forced with sane_behavior.  cgroup core
+	 * guarantees that @root doesn't have any children, so turning it
+	 * on for the root memcg is enough.
+	 */
+	if (cgroup_sane_behavior(root))
+		mem_cgroup_from_cont(root)->use_hierarchy = true;
+}
+
 struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
 	.subsys_id = mem_cgroup_subsys_id,
@@ -6794,6 +6810,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
+	.bind = mem_cgroup_bind,
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
-- 
cgit v1.2.3


From e57d5cf2f894e3f2727f8cf74bed0bc81cae70c8 Mon Sep 17 00:00:00 2001
From: Rami Rosen <ramirose@gmail.com>
Date: Tue, 16 Apr 2013 23:24:55 +0300
Subject: devcg: remove parent_cgroup.

In devcgroup_css_alloc(), there is no longer need for parent_cgroup.
bd2953ebbb("devcg: propagate local changes down the hierarchy") made
the variable parent_cgroup redundant. This patch removes parent_cgroup
from devcgroup_css_alloc().

Signed-off-by: Rami Rosen <ramirose@gmail.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 security/device_cgroup.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 994aeccea518..dd0dc574d78d 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -236,7 +236,6 @@ static void devcgroup_offline(struct cgroup *cgroup)
 static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
 {
 	struct dev_cgroup *dev_cgroup;
-	struct cgroup *parent_cgroup;
 
 	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
 	if (!dev_cgroup)
@@ -244,7 +243,6 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
 	INIT_LIST_HEAD(&dev_cgroup->exceptions);
 	INIT_LIST_HEAD(&dev_cgroup->propagate_pending);
 	dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
-	parent_cgroup = cgroup->parent;
 
 	return &dev_cgroup->css;
 }
-- 
cgit v1.2.3


From 712317ad97f41e738e1a19aa0a6392a78a84094e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Thu, 18 Apr 2013 23:09:52 -0700
Subject: cgroup: fix broken file xattrs

We should store file xattrs in struct cfent instead of struct cftype,
because cftype is a type while cfent is object instance of cftype.

For example each cgroup has a tasks file, and each tasks file is
associated with a uniq cfent, but all those files share the same
struct cftype.

Alexey Kodanev reported a crash, which can be reproduced:

  # mount -t cgroup -o xattr /sys/fs/cgroup
  # mkdir /sys/fs/cgroup/test
  # setfattr -n trusted.value -v test_value /sys/fs/cgroup/tasks
  # rmdir /sys/fs/cgroup/test
  # umount /sys/fs/cgroup
  oops!

In this case, simple_xattrs_free() will free the same struct simple_xattrs
twice.

tj: Dropped unused local variable @cft from cgroup_diput().

Cc: <stable@vger.kernel.org> # 3.8.x
Reported-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  3 ---
 kernel/cgroup.c        | 11 ++++++-----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cda7eb2239e1..c371888298d5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -422,9 +422,6 @@ struct cftype {
 	/* CFTYPE_* flags */
 	unsigned int flags;
 
-	/* file xattrs */
-	struct simple_xattrs xattrs;
-
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c16719e056ab..192d762ec1f4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -117,6 +117,9 @@ struct cfent {
 	struct list_head		node;
 	struct dentry			*dentry;
 	struct cftype			*type;
+
+	/* file xattrs */
+	struct simple_xattrs		xattrs;
 };
 
 /*
@@ -882,13 +885,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
 		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
-		struct cftype *cft = cfe->type;
 
 		WARN_ONCE(!list_empty(&cfe->node) &&
 			  cgrp != &cgrp->root->top_cgroup,
 			  "cfe still linked for %s\n", cfe->type->name);
+		simple_xattrs_free(&cfe->xattrs);
 		kfree(cfe);
-		simple_xattrs_free(&cft->xattrs);
 	}
 	iput(inode);
 }
@@ -2501,7 +2503,7 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
 	if (S_ISDIR(dentry->d_inode->i_mode))
 		return &__d_cgrp(dentry)->xattrs;
 	else
-		return &__d_cft(dentry)->xattrs;
+		return &__d_cfe(dentry)->xattrs;
 }
 
 static inline int xattr_enabled(struct dentry *dentry)
@@ -2677,8 +2679,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 
-	simple_xattrs_init(&cft->xattrs);
-
 	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2703,6 +2703,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		cfe->type = (void *)cft;
 		cfe->dentry = dentry;
 		dentry->d_fsdata = cfe;
+		simple_xattrs_init(&cfe->xattrs);
 		list_add_tail(&cfe->node, &parent->files);
 		cfe = NULL;
 	}
-- 
cgit v1.2.3


From cc20e01cd607282d48f8ea538aba10fa850a4312 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 26 Apr 2013 11:58:02 -0700
Subject: cgroup: fix use-after-free when umounting cgroupfs

Try:
  # mount -t cgroup xxx /cgroup
  # mkdir /cgroup/sub && rmdir /cgroup/sub && umount /cgroup

And you might see this:

ida_remove called for id=1 which is not allocated.

It's because cgroup_kill_sb() is called to destroy root->cgroup_ida
and free cgrp->root before ida_simple_removed() is called. What's
worse is we're accessing cgrp->root while it has been freed.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 192d762ec1f4..bd4de465d5a9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -848,9 +848,12 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	dput(cgrp->parent->dentry);
 
+	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+
 	/*
 	 * Drop the active superblock reference that we took when we
-	 * created the cgroup
+	 * created the cgroup. This will free cgrp->root, if we are
+	 * holding the last reference to @sb.
 	 */
 	deactivate_super(cgrp->root->sb);
 
@@ -862,7 +865,6 @@ static void cgroup_free_fn(struct work_struct *work)
 
 	simple_xattrs_free(&cgrp->xattrs);
 
-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
 	kfree(rcu_dereference_raw(cgrp->name));
 	kfree(cgrp);
 }
-- 
cgit v1.2.3


From 7ef70e48735e17d2be5c8e8f85052842b16b923a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 26 Apr 2013 11:58:03 -0700
Subject: cgroup: restore the call to eventfd->poll()

I mistakenly removed the call to eventfd->poll() while I was actually
intending to remove the return value...

Calling evenfd->poll() will hook cgroup_event_wake() to the poll
waitqueue, which will be called to unregister eventfd when rmdir a
cgroup or close eventfd.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bd4de465d5a9..3f14a1310d23 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3882,6 +3882,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
 	if (ret)
 		goto fail;
 
+	efile->f_op->poll(efile, &event->pt);
+
 	/*
 	 * Events should be removed after rmdir of cgroup directory, but before
 	 * destroying subsystem state objects. Let's take reference to cgroup
-- 
cgit v1.2.3


From e0e80a02e5701c8790bd348ab59edb154fbda60b Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Sat, 27 Apr 2013 06:52:43 -0700
Subject: cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()

In cpuset_hotplug_workfn(), partition_sched_domains() is called without
hotplug lock held, which is actually needed (stated in the function
header of partition_sched_domains()).

This patch tries to use rebuild_sched_domains() to solve the above
issue, and makes the code looks a little simpler.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 866d78ee7930..8f0f45e002f2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2172,17 +2172,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	flush_workqueue(cpuset_propagate_hotplug_wq);
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated) {
-		struct sched_domain_attr *attr;
-		cpumask_var_t *doms;
-		int ndoms;
-
-		mutex_lock(&cpuset_mutex);
-		ndoms = generate_sched_domains(&doms, &attr);
-		mutex_unlock(&cpuset_mutex);
-
-		partition_sched_domains(ndoms, doms, attr);
-	}
+	if (cpus_updated)
+		rebuild_sched_domains();
 }
 
 void cpuset_update_active_cpus(bool cpu_online)
-- 
cgit v1.2.3


From 5b16c2a493fe1e439c4e7ad51f58153968ca6cf3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sat, 27 Apr 2013 06:52:43 -0700
Subject: cpuset: fix cpu hotplug vs rebuild_sched_domains() race

rebuild_sched_domains() might pass doms with offlined cpu to
partition_sched_domains(), which results in an oops:

general protection fault: 0000 [#1] SMP
...
RIP: 0010:[<ffffffff81077a1e>]  [<ffffffff81077a1e>] get_group+0x6e/0x90
...
Call Trace:
 [<ffffffff8107f07c>] build_sched_domains+0x70c/0xcb0
 [<ffffffff8107f2a7>] ? build_sched_domains+0x937/0xcb0
 [<ffffffff81173f64>] ? kfree+0xe4/0x1b0
 [<ffffffff8107f6e0>] ? partition_sched_domains+0xc0/0x470
 [<ffffffff8107f905>] partition_sched_domains+0x2e5/0x470
 [<ffffffff8107f6e0>] ? partition_sched_domains+0xc0/0x470
 [<ffffffff810c9007>] ? generate_sched_domains+0xc7/0x530
 [<ffffffff810c94a8>] rebuild_sched_domains_locked+0x38/0x70
 [<ffffffff810cb4a4>] cpuset_write_resmask+0x1a4/0x500
 [<ffffffff810c8700>] ? cpuset_mount+0xe0/0xe0
 [<ffffffff810c7f50>] ? cpuset_read_u64+0x100/0x100
 [<ffffffff810be890>] ? cgroup_iter_next+0x90/0x90
 [<ffffffff810cb300>] ? cpuset_css_offline+0x70/0x70
 [<ffffffff810c1a73>] cgroup_file_write+0x133/0x2e0
 [<ffffffff8118995b>] vfs_write+0xcb/0x130
 [<ffffffff8118a174>] sys_write+0x64/0xa0

Reported-by: Li Zhong <zhong@linux.vnet.ibm.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8f0f45e002f2..93d140f159cc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -769,12 +769,20 @@ static void rebuild_sched_domains_locked(void)
 	lockdep_assert_held(&cpuset_mutex);
 	get_online_cpus();
 
+	/*
+	 * We have raced with CPU hotplug. Don't do anything to avoid
+	 * passing doms with offlined cpu to partition_sched_domains().
+	 * Anyways, hotplug work item will rebuild sched domains.
+	 */
+	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+		goto out;
+
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
+out:
 	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-- 
cgit v1.2.3


From 2a0010af17b1739ef8ea8cf02647a127241ee674 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sun, 28 Apr 2013 09:46:57 +0800
Subject: cpuset: fix compile warning when CONFIG_SMP=n

Reported by Fengguang's kbuild test robot:

kernel/cpuset.c:787: warning: 'generate_sched_domains' defined but not used

Introduced by commit e0e80a02e5701c8790bd348ab59edb154fbda60b
("cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()),
which removed generate_sched_domains() from cpuset_hotplug_workfn().

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cpuset.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 93d140f159cc..9bd30b9c430c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -789,13 +789,6 @@ out:
 static void rebuild_sched_domains_locked(void)
 {
 }
-
-static int generate_sched_domains(cpumask_var_t **domains,
-			struct sched_domain_attr **attributes)
-{
-	*domains = NULL;
-	return 1;
-}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
-- 
cgit v1.2.3