diff options
author | Roman Gushchin <guro@fb.com> | 2019-04-19 10:03:04 -0700 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2019-04-19 11:26:48 -0700 |
commit | 76f969e8948d82e78e1bc4beb6b9465908e74873 (patch) | |
tree | 1f5459d94820c5e5ea7293b103e8531d389c15c1 /include | |
parent | 4dcabece4c3a9f9522127be12cc12cc120399b2f (diff) | |
download | linux-76f969e8948d82e78e1bc4beb6b9465908e74873.tar.gz linux-76f969e8948d82e78e1bc4beb6b9465908e74873.tar.bz2 linux-76f969e8948d82e78e1bc4beb6b9465908e74873.zip |
cgroup: cgroup v2 freezer
Cgroup v1 implements the freezer controller, which provides an ability
to stop the workload in a cgroup and temporarily free up some
resources (cpu, io, network bandwidth and, potentially, memory)
for some other tasks. Cgroup v2 lacks this functionality.
This patch implements freezer for cgroup v2.
Cgroup v2 freezer tries to put tasks into a state similar to jobctl
stop. This means that tasks can be killed, ptraced (using
PTRACE_SEIZE*), and interrupted. It is possible to attach to
a frozen task, get some information (e.g. read registers) and detach.
It's also possible to migrate a frozen tasks to another cgroup.
This differs cgroup v2 freezer from cgroup v1 freezer, which mostly
tried to imitate the system-wide freezer. However uninterruptible
sleep is fine when all tasks are going to be frozen (hibernation case),
it's not the acceptable state for some subset of the system.
Cgroup v2 freezer is not supporting freezing kthreads.
If a non-root cgroup contains kthread, the cgroup still can be frozen,
but the kthread will remain running, the cgroup will be shown
as non-frozen, and the notification will not be delivered.
* PTRACE_ATTACH is not working because non-fatal signal delivery
is blocked in frozen state.
There are some interface differences between cgroup v1 and cgroup v2
freezer too, which are required to conform the cgroup v2 interface
design principles:
1) There is no separate controller, which has to be turned on:
the functionality is always available and is represented by
cgroup.freeze and cgroup.events cgroup control files.
2) The desired state is defined by the cgroup.freeze control file.
Any hierarchical configuration is allowed.
3) The interface is asynchronous. The actual state is available
using cgroup.events control file ("frozen" field). There are no
dedicated transitional states.
4) It's allowed to make any changes with the cgroup hierarchy
(create new cgroups, remove old cgroups, move tasks between cgroups)
no matter if some cgroups are frozen.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
No-objection-from-me-by: Oleg Nesterov <oleg@redhat.com>
Cc: kernel-team@fb.com
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cgroup-defs.h | 28 | ||||
-rw-r--r-- | include/linux/cgroup.h | 43 | ||||
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | include/linux/sched/jobctl.h | 2 |
4 files changed, 75 insertions, 0 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7d57890cec67..77258d276f93 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -65,6 +65,12 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, + + /* Control group has to be frozen. */ + CGRP_FREEZE, + + /* Cgroup is frozen. */ + CGRP_FROZEN, }; /* cgroup_root->flags */ @@ -317,6 +323,25 @@ struct cgroup_rstat_cpu { struct cgroup *updated_next; /* NULL iff not on the list */ }; +struct cgroup_freezer_state { + /* Should the cgroup and its descendants be frozen. */ + bool freeze; + + /* Should the cgroup actually be frozen? */ + int e_freeze; + + /* Fields below are protected by css_set_lock */ + + /* Number of frozen descendant cgroups */ + int nr_frozen_descendants; + + /* + * Number of tasks, which are counted as frozen: + * frozen, SIGSTOPped, and PTRACEd. + */ + int nr_frozen_tasks; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -453,6 +478,9 @@ struct cgroup { /* If there is block congestion on this cgroup. */ atomic_t congestion_count; + /* Used to store internal freezer state */ + struct cgroup_freezer_state freezer; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 81f58b4a5418..3e2efd412dfa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -881,4 +881,47 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) free_cgroup_ns(ns); } +#ifdef CONFIG_CGROUPS + +void cgroup_enter_frozen(void); +void cgroup_leave_frozen(bool always_leave); +void cgroup_update_frozen(struct cgroup *cgrp); +void cgroup_freeze(struct cgroup *cgrp, bool freeze); +void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, + struct cgroup *dst); +void cgroup_freezer_frozen_exit(struct task_struct *task); +static inline bool cgroup_task_freeze(struct task_struct *task) +{ + bool ret; + + if (task->flags & PF_KTHREAD) + return false; + + rcu_read_lock(); + ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); + rcu_read_unlock(); + + return ret; +} + +static inline bool cgroup_task_frozen(struct task_struct *task) +{ + return task->frozen; +} + +#else /* !CONFIG_CGROUPS */ + +static inline void cgroup_enter_frozen(void) { } +static inline void cgroup_leave_frozen(bool always_leave) { } +static inline bool cgroup_task_freeze(struct task_struct *task) +{ + return false; +} +static inline bool cgroup_task_frozen(struct task_struct *task) +{ + return false; +} + +#endif /* !CONFIG_CGROUPS */ + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1549584a1538..45b2199bd683 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -726,6 +726,8 @@ struct task_struct { #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; + /* task is frozen/stopped (used by the cgroup freezer) */ + unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP /* to be used once the psi infrastructure lands upstream. */ diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index 98228bd48aee..fa067de9f1a9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -18,6 +18,7 @@ struct task_struct; #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ +#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -26,6 +27,7 @@ struct task_struct; #define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) +#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) |