From ef2c41cf38a7559bbf91af42d5b6a4429db8fc68 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Feb 2020 14:26:22 +0100 Subject: clone3: allow spawning processes into cgroups This adds support for creating a process in a different cgroup than its parent. Callers can limit and account processes and threads right from the moment they are spawned: - A service manager can directly spawn new services into dedicated cgroups. - A process can be directly created in a frozen cgroup and will be frozen as well. - The initial accounting jitter experienced by process supervisors and daemons is eliminated with this. - Threaded applications or even thread implementations can choose to create a specific cgroup layout where each thread is spawned directly into a dedicated cgroup. This feature is limited to the unified hierarchy. Callers need to pass a directory file descriptor for the target cgroup. The caller can choose to pass an O_PATH file descriptor. All usual migration restrictions apply, i.e. there can be no processes in inner nodes. In general, creating a process directly in a target cgroup adheres to all migration restrictions. One of the biggest advantages of this feature is that CLONE_INTO_GROUP does not need to grab the write side of the cgroup cgroup_threadgroup_rwsem. This global lock makes moving tasks/threads around super expensive. With clone3() this lock is avoided. Cc: Tejun Heo Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Johannes Weiner Cc: Li Zefan Cc: Peter Zijlstra Cc: cgroups@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tejun Heo --- kernel/fork.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 9245b6e53f55..635d6369dfb9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process( * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p); + retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; @@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p); + cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process( bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - cgroup_cancel_fork(p); + cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); @@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, !valid_signal(args.exit_signal))) return -EINVAL; + if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0) + return -EINVAL; + *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), @@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, + .cgroup = args.cgroup, }; if (args.set_tid && @@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) + if (kargs->flags & + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* -- cgit v1.2.3