From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: sched/cputime: Expose cputime_adjust() Will be used by basic cgroup resource stat reporting later. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- include/linux/sched/cputime.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 4c5b9735c1ae..9251044335c5 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -53,7 +53,8 @@ static inline void task_cputime_scaled(struct task_struct *t, extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); - +extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st); /* * Thread group CPU time accounting. -- cgit v1.2.3 From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: cpuacct: Introduce cgroup_account_cputime[_field]() Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge() and cgroup_account_field(). This doesn't introduce any functional changes and will be used to add cgroup basic resource accounting. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar --- include/linux/cgroup.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d023ac5e377f..6cd579329310 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -688,6 +689,43 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ +/* + * Basic resource stats. + */ +#ifdef CONFIG_CGROUPS + +#ifdef CONFIG_CGROUP_CPUACCT +void cpuacct_charge(struct task_struct *tsk, u64 cputime); +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_account_field(struct task_struct *tsk, int index, + u64 val) {} +#endif + +static inline void cgroup_account_cputime(struct task_struct *task, + u64 delta_exec) +{ + cpuacct_charge(task, delta_exec); +} + +static inline void cgroup_account_cputime_field(struct task_struct *task, + enum cpu_usage_stat index, + u64 delta_exec) +{ + cpuacct_account_field(task, index, delta_exec); +} + +#else /* CONFIG_CGROUPS */ + +static inline void cgroup_account_cputime(struct task_struct *task, + u64 delta_exec) {} +static inline void cgroup_account_cputime_field(struct task_struct *task, + enum cpu_usage_stat index, + u64 delta_exec) {} + +#endif /* CONFIG_CGROUPS */ + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. -- cgit v1.2.3 From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:05 -0700 Subject: cgroup: Implement cgroup2 basic CPU usage accounting In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner Cc: Waiman Long Cc: Roman Gushchin --- include/linux/cgroup-defs.h | 57 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup.h | 22 +++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ade4a78a54c2..3e55bbd31ad1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -254,6 +255,57 @@ struct css_set { struct rcu_head rcu_head; }; +/* + * cgroup basic resource usage statistics. Accounting is done per-cpu in + * cgroup_cpu_stat which is then lazily propagated up the hierarchy on + * reads. + * + * When a stat gets updated, the cgroup_cpu_stat and its ancestors are + * linked into the updated tree. On the following read, propagation only + * considers and consumes the updated tree. This makes reading O(the + * number of descendants which have been active since last read) instead of + * O(the total number of descendants). + * + * This is important because there can be a lot of (draining) cgroups which + * aren't active and stat may be read frequently. The combination can + * become very expensive. By propagating selectively, increasing reading + * frequency decreases the cost of each read. + */ +struct cgroup_cpu_stat { + /* + * ->sync protects all the current counters. These are the only + * fields which get updated in the hot path. + */ + struct u64_stats_sync sync; + struct task_cputime cputime; + + /* + * Snapshots at the last reading. These are used to calculate the + * deltas to propagate to the global counters. + */ + struct task_cputime last_cputime; + + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_cpu_stat_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +struct cgroup_stat { + /* per-cpu statistics are collected into the folowing global counters */ + struct task_cputime cputime; + struct prev_cputime prev_cputime; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -353,6 +405,11 @@ struct cgroup { */ struct cgroup *dom_cgrp; + /* cgroup basic resource statistics */ + struct cgroup_cpu_stat __percpu *cpu_stat; + struct cgroup_stat pending_stat; /* pending from children */ + struct cgroup_stat stat; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6cd579329310..328a70ce0e23 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix); + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec); + static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_charge(task, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime(cgrp, delta_exec); + rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { + struct cgroup *cgrp; + cpuacct_account_field(task, index, delta_exec); + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + if (cgroup_parent(cgrp)) + __cgroup_account_cputime_field(cgrp, index, delta_exec); + rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ -- cgit v1.2.3 From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Oct 2017 16:18:27 -0700 Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat The basic cpu stat is currently shown with "cpu." prefix in cgroup.stat, and the same information is duplicated in cpu.stat when cpu controller is enabled. This is ugly and not very scalable as we want to expand the coverage of stat information which is always available. This patch makes cgroup core always create "cpu.stat" file and show the basic cpu stat there and calls the cpu controller to show the extra stats when enabled. This ensures that the same information isn't presented in multiple places and makes future expansion of basic stats easier. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) --- include/linux/cgroup-defs.h | 2 ++ include/linux/cgroup.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3e55bbd31ad1..ada6df7b1f55 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -569,6 +569,8 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); + int (*css_extra_stat_show)(struct seq_file *seq, + struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 328a70ce0e23..03cad08b09d1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -703,8 +703,6 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif -void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix); - void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); -- cgit v1.2.3