14 files changed, 717 insertions, 37 deletions
diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
 S: 75013 Paris
 S: France
 
+N: Aleksa Sarai
+E: cyphar@cyphar.com
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
+
 N: Dipankar Sarma
 E: dipankar@in.ibm.com
 D: RCU
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 96ce071a3633..3f5a40f57d4a 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -22,6 +22,8 @@ net_cls.txt
 	- Network classifier cgroups details and usages.
 net_prio.txt
 	- Network priority cgroups details and usages.
+pids.txt
+	- Process number cgroups details and usages.
 resource_counter.txt
 	- Resource Counter API.
 unified-hierarchy.txt
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt
new file mode 100644
index 000000000000..1a078b5d281a
--- /dev/null
+++ b/Documentation/cgroups/pids.txt
@@ -0,0 +1,85 @@
+						   Process Number Controller
+						   =========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+Example
+-------
+
+First, we mount the pids controller:
+# mkdir -p /sys/fs/cgroup/pids
+# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it:
+# mkdir -p /sys/fs/cgroup/pids/parent/child
+# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail:
+
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's):
+
+# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.max
+max
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporarily unavailable
+#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current):
+
+# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporarily unavailable
+#
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7647ab..1ee9caf29e57 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -23,10 +23,13 @@ CONTENTS
 5. Other Changes
   5-1. [Un]populated Notification
   5-2. Other Core Changes
-  5-3. Per-Controller Changes
-    5-3-1. blkio
-    5-3-2. cpuset
-    5-3-3. memory
+  5-3. Controller File Conventions
+    5-3-1. Format
+    5-3-2. Control Knobs
+  5-4. Per-Controller Changes
+    5-4-1. blkio
+    5-4-2. cpuset
+    5-4-3. memory
 6. Planned Changes
   6-1. CAP for resource control
 
@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and
 - The "cgroup.clone_children" file is removed.
 
 
-5-3. Per-Controller Changes
+5-3. Controller File Conventions
 
-5-3-1. blkio
+5-3-1. Format
+
+In general, all controller files should be in one of the following
+formats whenever possible.
+
+- Values only files
+
+  VAL0 VAL1...\n
+
+- Flat keyed files
+
+  KEY0 VAL0\n
+  KEY1 VAL1\n
+  ...
+
+- Nested keyed files
+
+  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+  ...
+
+For a writeable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+5-3-2. Control Knobs
+
+- Settings for a single feature should generally be implemented in a
+  single file.
+
+- In general, the root cgroup should be exempt from resource control
+  and thus shouldn't have resource control knobs.
+
+- If a controller implements ratio based resource distribution, the
+  control knob should be named "weight" and have the range [1, 10000]
+  and 100 should be the default value.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the control knobs should be named "min" and "max"
+  respectively.  If a controller implements best effort resource
+  guarantee and/or limit, the control knobs should be named "low" and
+  "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+
+- If a setting has configurable default value and specific overrides,
+  the default settings should be keyed with "default" and appear as
+  the first entry in the file.  Specific entries can use "default" as
+  its value to indicate inheritance of the default value.
+
+
+5-4. Per-Controller Changes
+
+5-4-1. blkio
 
 - blk-throttle becomes properly hierarchical.
 
 
-5-3-2. cpuset
+5-4-2. cpuset
 
 - Tasks are kept in empty cpusets after hotplug and take on the masks
   of the nearest non-empty ancestor, instead of being moved to it.
@@ -388,7 +452,7 @@ supported and the interface files "release_agent" and
   masks of the nearest non-empty ancestor.
 
 
-5-3-3. memory
+5-4-3. memory
 
 - use_hierarchy is on by default and the cgroup file for the flag is
   not created.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..4d8fcf2187dc 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,12 +34,17 @@ struct seq_file;
 
 /* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _cgrp_id,
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+	__unused_tag_ ## _t = CGROUP_ ## _t - 1,
 enum cgroup_subsys_id {
 #include <linux/cgroup_subsys.h>
 	CGROUP_SUBSYS_COUNT,
 };
+#undef SUBSYS_TAG
 #undef SUBSYS
 
+#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
+
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
@@ -318,7 +323,7 @@ struct cftype {
 	 * end of cftype array.
 	 */
 	char name[MAX_CFTYPE_NAME];
-	int private;
+	unsigned long private;
 	/*
 	 * If not 0, file mode is set to this value, otherwise it will
 	 * be figured out automatically
@@ -406,7 +411,9 @@ struct cgroup_subsys {
 			      struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_subsys_state *css,
 		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task, void **priv_p);
+	void (*cancel_fork)(struct task_struct *task, void *priv);
+	void (*fork)(struct task_struct *task, void *priv);
 	void (*exit)(struct cgroup_subsys_state *css,
 		     struct cgroup_subsys_state *old_css,
 		     struct task_struct *task);
@@ -434,6 +441,9 @@ struct cgroup_subsys {
 	int id;
 	const char *name;
 
+	/* optional, initialized automatically during boot if not set */
+	const char *legacy_name;
+
 	/* link to parent, protected by cgroup_lock() */
 	struct cgroup_root *root;
 
@@ -491,6 +501,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 
 #else	/* CONFIG_CGROUPS */
 
+#define CGROUP_CANFORK_COUNT 0
 #define CGROUP_SUBSYS_COUNT 0
 
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..eb7ca55f72ef 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,15 @@
 
 #ifdef CONFIG_CGROUPS
 
+/*
+ * All weight knobs on the default hierarchy should use the following min,
+ * default and max values.  The default value is the logarithmic center of
+ * MIN and MAX and allows 100x to be expressed in both directions.
+ */
+#define CGROUP_WEIGHT_MIN		1
+#define CGROUP_WEIGHT_DFL		100
+#define CGROUP_WEIGHT_MAX		10000
+
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
@@ -62,7 +71,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+			     void *old_ss_priv[CGROUP_CANFORK_COUNT]);
 void cgroup_exit(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -524,7 +538,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  void *ss_priv[CGROUP_CANFORK_COUNT])
+{ return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb14403..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,17 @@
  *
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
+
+/*
+ * This file *must* be included with SUBSYS() defined.
+ * SUBSYS_TAG() is a noop if undefined.
+ */
+
+#ifndef SUBSYS_TAG
+#define __TMP_SUBSYS_TAG
+#define SUBSYS_TAG(_x)
+#endif
+
 #if IS_ENABLED(CONFIG_CPUSETS)
 SUBSYS(cpuset)
 #endif
@@ -48,11 +59,28 @@ SUBSYS(hugetlb)
 #endif
 
 /*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
+SUBSYS_TAG(CANFORK_END)
+
+/*
  * The following subsystems are not supported on the default hierarchy.
  */
 #if IS_ENABLED(CONFIG_CGROUP_DEBUG)
 SUBSYS(debug)
 #endif
+
+#ifdef __TMP_SUBSYS_TAG
+#undef __TMP_SUBSYS_TAG
+#undef SUBSYS_TAG
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/init/Kconfig b/init/Kconfig
index ba1e6eaf4c36..bb9b4dd55889 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -947,6 +947,22 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_PIDS
+	bool "PIDs cgroup subsystem"
+	help
+	  Provides enforcement of process number limits in the scope of a
+	  cgroup. Any attempt to fork more processes than is allowed in the
+	  cgroup will fail. PIDs are fundamentally a global resource because it
+	  is fairly trivial to reach PID exhaustion before you reach even a
+	  conservative kmemcg limit. As a result, it is possible to grind a
+	  system to a halt without being limited by other cgroup policies. The
+	  PIDs cgroup subsystem is designed to stop this from happening.
+
+	  It should be noted that organisational operations (such as attaching
+	  to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+	  since the PIDs limit only affects a process's ability to fork, not to
+	  attach to a cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b89f3168411b..f3f5cd5e2c0d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
  * part of that cgroup.
  */
 struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
  * The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
 
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 
 	idr_preload(gfp_mask);
 	spin_lock_bh(&cgroup_idr_lock);
-	ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
 	spin_unlock_bh(&cgroup_idr_lock);
 	idr_preload_end();
 	return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
 {
+	struct cgroup_subsys *ss = cft->ss;
+
 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-			 cft->ss->name, cft->name);
+			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+			 cft->name);
 	else
 		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	for_each_subsys(ss, ssid)
-		if (root->subsys_mask & (1 << ssid))
-			seq_printf(seq, ",%s", ss->name);
+	if (root != &cgrp_dfl_root)
+		for_each_subsys(ss, ssid)
+			if (root->subsys_mask & (1 << ssid))
+				seq_printf(seq, ",%s", ss->legacy_name);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1447,7 +1455,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		}
 
 		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name))
+			if (strcmp(token, ss->legacy_name))
 				continue;
 			if (ss->disabled)
 				continue;
@@ -1666,7 +1674,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
 	root_cgrp->id = ret;
@@ -4579,7 +4587,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 	if (err)
 		goto err_free_css;
 
-	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
 	if (err < 0)
 		goto err_free_percpu_ref;
 	css->id = err;
@@ -4656,7 +4664,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
 	 */
-	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
 		goto out_cancel_ref;
@@ -4955,6 +4963,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -4993,6 +5002,8 @@ int __init cgroup_init_early(void)
 
 		ss->id = i;
 		ss->name = cgroup_subsys_name[i];
+		if (!ss->legacy_name)
+			ss->legacy_name = cgroup_subsys_name[i];
 
 		if (ss->early_init)
 			cgroup_init_subsys(ss, true);
@@ -5136,9 +5147,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			continue;
 
 		seq_printf(m, "%d:", root->hierarchy_id);
-		for_each_subsys(ss, ssid)
-			if (root->subsys_mask & (1 << ssid))
-				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+		if (root != &cgrp_dfl_root)
+			for_each_subsys(ss, ssid)
+				if (root->subsys_mask & (1 << ssid))
+					seq_printf(m, "%s%s", count++ ? "," : "",
+						   ss->legacy_name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
@@ -5178,7 +5191,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->name, ss->root->hierarchy_id,
+			   ss->legacy_name, ss->root->hierarchy_id,
 			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 
 	mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5210,19 @@ static const struct file_operations proc_cgroupstats_operations = {
 	.release = single_release,
 };
 
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+		return &ss_priv[i - CGROUP_CANFORK_START];
+	return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	void **private = subsys_canfork_priv_p(ss_priv, i);
+	return private ? *private : NULL;
+}
+
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5238,57 @@ void cgroup_fork(struct task_struct *child)
 }
 
 /**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+		    void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i, j, ret;
+
+	for_each_subsys_which(ss, i, &have_canfork_callback) {
+		ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+		if (ret)
+			goto out_revert;
+	}
+
+	return 0;
+
+out_revert:
+	for_each_subsys(ss, j) {
+		if (j >= i)
+			break;
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+	}
+
+	return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+			void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	for_each_subsys(ss, i)
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
  *
@@ -5221,7 +5298,8 @@ void cgroup_fork(struct task_struct *child)
  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
  * list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      void *old_ss_priv[CGROUP_CANFORK_COUNT])
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -5266,7 +5344,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	for_each_subsys_which(ss, i, &have_fork_callback)
-		ss->fork(child);
+		ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
 }
 
 /**
@@ -5400,12 +5478,14 @@ static int __init cgroup_disable(char *str)
 			continue;
 
 		for_each_subsys(ss, i) {
-			if (!strcmp(token, ss->name)) {
-				ss->disabled = 1;
-				printk(KERN_INFO "Disabling %s control group"
-					" subsystem\n", ss->name);
-				break;
-			}
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			ss->disabled = 1;
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
+			break;
 		}
 	}
 	return 1;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
  * to do anything as freezer_attach() will put @task into the appropriate
  * state.
  */
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * Use 64-bit types so that we can safely represent "max" as
+	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+	 */
+	atomic64_t			counter;
+	int64_t				limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+	return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct pids_cgroup *pids;
+
+	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+	if (!pids)
+		return ERR_PTR(-ENOMEM);
+
+	pids->limit = PIDS_MAX;
+	atomic64_set(&pids->counter, 0);
+	return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+	/*
+	 * A negative count (or overflow for that matter) is invalid,
+	 * and indicates a bug in the `pids` controller proper.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p, *q;
+
+	for (p = pids; p; p = parent_pids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+
+		/*
+		 * Since new is capped to the maximum number of pid_t, if
+		 * p->limit is %PIDS_MAX then we know that this test will never
+		 * fail.
+		 */
+		if (new > p->limit)
+			goto revert;
+	}
+
+	return 0;
+
+revert:
+	for (q = pids; q != p; q = parent_pids(q))
+		pids_cancel(q, num);
+	pids_cancel(p, num);
+
+	return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+			   struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		/*
+		 * No need to pin @old_css between here and cancel_attach()
+		 * because cgroup core protects it from being freed before
+		 * the migration completes or fails.
+		 */
+		old_css = task_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(pids, 1);
+		pids_uncharge(old_pids, 1);
+	}
+
+	return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+			       struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		old_css = task_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(old_pids, 1);
+		pids_uncharge(pids, 1);
+	}
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+	struct cgroup_subsys_state *css;
+	struct pids_cgroup *pids;
+	int err;
+
+	/*
+	 * Use the "current" task_css for the pids subsystem as the tentative
+	 * css. It is possible we will charge the wrong hierarchy, in which
+	 * case we will forcefully revert/reapply the charge on the right
+	 * hierarchy after it is committed to the task proper.
+	 */
+	css = task_get_css(current, pids_cgrp_id);
+	pids = css_pids(css);
+
+	err = pids_try_charge(pids, 1);
+	if (err)
+		goto err_css_put;
+
+	*priv_p = css;
+	return 0;
+
+err_css_put:
+	css_put(css);
+	return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css = priv;
+	struct pids_cgroup *pids = css_pids(css);
+
+	pids_uncharge(pids, 1);
+	css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *old_css = priv;
+	struct pids_cgroup *pids;
+	struct pids_cgroup *old_pids = css_pids(old_css);
+
+	css = task_get_css(task, pids_cgrp_id);
+	pids = css_pids(css);
+
+	/*
+	 * If the association has changed, we have to revert and reapply the
+	 * charge/uncharge on the wrong hierarchy to the current one. Since
+	 * the association can only change due to an organisation event, it's
+	 * okay for us to ignore the limit in this case.
+	 */
+	if (pids != old_pids) {
+		pids_uncharge(old_pids, 1);
+		pids_charge(pids, 1);
+	}
+
+	css_put(css);
+	css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+		      struct cgroup_subsys_state *old_css,
+		      struct task_struct *task)
+{
+	struct pids_cgroup *pids = css_pids(old_css);
+
+	pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PIDS_MAX_STR)) {
+		limit = PIDS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtoll(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit < 0 || limit >= PIDS_MAX)
+		return -EINVAL;
+
+set_limit:
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	pids->limit = limit;
+	return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit = pids->limit;
+
+	if (limit >= PIDS_MAX)
+		seq_printf(sf, "%s\n", PIDS_MAX_STR);
+	else
+		seq_printf(sf, "%lld\n", limit);
+
+	return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct pids_cgroup *pids = css_pids(css);
+
+	return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+	{
+		.name = "max",
+		.write = pids_max_write,
+		.seq_show = pids_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pids_current_read,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+	.css_alloc	= pids_css_alloc,
+	.css_free	= pids_css_free,
+	.can_attach 	= pids_can_attach,
+	.cancel_attach 	= pids_cancel_attach,
+	.can_fork	= pids_can_fork,
+	.cancel_fork	= pids_cancel_fork,
+	.fork		= pids_fork,
+	.exit		= pids_exit,
+	.legacy_cftypes	= pids_files,
+	.dfl_cftypes	= pids_files,
+};
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b1a61cddc19..03aa2e6de7a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1246,6 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1518,6 +1519,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->task_works = NULL;
 
 	/*
+	 * Ensure that the cgroup subsystem policies allow the new process to be
+	 * forked. It should be noted that the new process's css_set can be changed
+	 * between here and cgroup_post_fork() if an organisation operation is in
+	 * progress.
+	 */
+	retval = cgroup_can_fork(p, cgrp_ss_priv);
+	if (retval)
+		goto bad_fork_free_pid;
+
+	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
 	 */
@@ -1553,7 +1564,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_cancel_cgroup;
 	}
 
 	if (likely(p->pid)) {
@@ -1595,7 +1606,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, cgrp_ss_priv);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
@@ -1605,6 +1616,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	return p;
 
+bad_fork_cancel_cgroup:
+	cgroup_cancel_fork(p, cgrp_ss_priv);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b864ecee0e1..d8420c233ff7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8133,7 +8133,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 	sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
 	sched_move_task(task);
 }