diff options
author | Namhyung Kim <namhyung@kernel.org> | 2022-05-18 15:47:24 -0700 |
---|---|---|
committer | Arnaldo Carvalho de Melo <acme@redhat.com> | 2022-05-26 12:36:58 -0300 |
commit | 685439a7a037d8677e3d1acf0302624002ee6a6d (patch) | |
tree | 64d8e50853443d53a67de92bae25724d1f5c1290 /tools | |
parent | b36888f71c8542cd49ecaf29cd1ba874c733b5fe (diff) | |
download | linux-stable-685439a7a037d8677e3d1acf0302624002ee6a6d.tar.gz linux-stable-685439a7a037d8677e3d1acf0302624002ee6a6d.tar.bz2 linux-stable-685439a7a037d8677e3d1acf0302624002ee6a6d.zip |
perf record: Add cgroup support for off-cpu profiling
This covers two different use cases. The first one is cgroup
filtering, given by the -G/--cgroup option, which restricts off-cpu
profiling to tasks in the given cgroups only.
The other use case is cgroup sampling, which is enabled by the
--all-cgroups option; it adds PERF_SAMPLE_CGROUP to the sample_type
so that the cgroup id of the task is set in the sample data.
Example output:
$ sudo perf record -a --off-cpu --all-cgroups sleep 1
$ sudo perf report --stdio -s comm,cgroup --call-graph=no
...
# Samples: 144 of event 'offcpu-time'
# Event count (approx.): 48452045427
#
# Children Self Command Cgroup
# ........ ........ ............... ..........................................
#
61.57% 5.60% Chrome_ChildIOT /user.slice/user-657345.slice/user@657345.service/app.slice/...
29.51% 7.38% Web Content /user.slice/user-657345.slice/user@657345.service/app.slice/...
17.48% 1.59% Chrome_IOThread /user.slice/user-657345.slice/user@657345.service/app.slice/...
16.48% 4.12% pipewire-pulse /user.slice/user-657345.slice/user@657345.service/session.slice/...
14.48% 2.07% perf /user.slice/user-657345.slice/user@657345.service/app.slice/...
14.30% 7.15% CompositorTileW /user.slice/user-657345.slice/user@657345.service/app.slice/...
13.33% 6.67% Timer /user.slice/user-657345.slice/user@657345.service/app.slice/...
...
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220518224725.742882-6-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools')
-rw-r--r-- | tools/perf/builtin-record.c | 2 | ||||
-rw-r--r-- | tools/perf/util/bpf_off_cpu.c | 48 | ||||
-rw-r--r-- | tools/perf/util/bpf_skel/off_cpu.bpf.c | 33 | ||||
-rw-r--r-- | tools/perf/util/off_cpu.h | 7 |
4 files changed, 85 insertions, 5 deletions
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 96014387f553..9a71f0330137 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -892,7 +892,7 @@ static int record__config_text_poke(struct evlist *evlist) static int record__config_off_cpu(struct record *rec) { - return off_cpu_prepare(rec->evlist, &rec->opts.target); + return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts); } static bool record__kcore_readable(struct machine *machine) diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c index 874856c55101..b73e84a02264 100644 --- a/tools/perf/util/bpf_off_cpu.c +++ b/tools/perf/util/bpf_off_cpu.c @@ -5,10 +5,12 @@ #include "util/evlist.h" #include "util/off_cpu.h" #include "util/perf-hooks.h" +#include "util/record.h" #include "util/session.h" #include "util/target.h" #include "util/cpumap.h" #include "util/thread_map.h" +#include "util/cgroup.h" #include <bpf/bpf.h> #include "bpf_skel/off_cpu.skel.h" @@ -24,6 +26,7 @@ struct off_cpu_key { u32 tgid; u32 stack_id; u32 state; + u64 cgroup_id; }; union off_cpu_data { @@ -116,10 +119,11 @@ static void check_sched_switch_args(void) } } -int off_cpu_prepare(struct evlist *evlist, struct target *target) +int off_cpu_prepare(struct evlist *evlist, struct target *target, + struct record_opts *opts) { int err, fd, i; - int ncpus = 1, ntasks = 1; + int ncpus = 1, ntasks = 1, ncgrps = 1; if (off_cpu_config(evlist) < 0) { pr_err("Failed to config off-cpu BPF event\n"); @@ -143,6 +147,21 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) bpf_map__set_max_entries(skel->maps.task_filter, ntasks); } + if (evlist__first(evlist)->cgrp) { + ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */ + bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps); + + if (!cgroup_is_v2("perf_event")) + skel->rodata->uses_cgroup_v1 = true; + } + + if (opts->record_cgroup) { + skel->rodata->needs_cgroup = true; + + if 
(!cgroup_is_v2("perf_event")) + skel->rodata->uses_cgroup_v1 = true; + } + set_max_rlimit(); check_sched_switch_args(); @@ -178,6 +197,29 @@ int off_cpu_prepare(struct evlist *evlist, struct target *target) } } + if (evlist__first(evlist)->cgrp) { + struct evsel *evsel; + u8 val = 1; + + skel->bss->has_cgroup = 1; + fd = bpf_map__fd(skel->maps.cgroup_filter); + + evlist__for_each_entry(evlist, evsel) { + struct cgroup *cgrp = evsel->cgrp; + + if (cgrp == NULL) + continue; + + if (!cgrp->id && read_cgroup_id(cgrp) < 0) { + pr_err("Failed to read cgroup id of %s\n", + cgrp->name); + goto out; + } + + bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY); + } + } + err = off_cpu_bpf__attach(skel); if (err) { pr_err("Failed to attach off-cpu BPF skeleton\n"); @@ -275,6 +317,8 @@ int off_cpu_write(struct perf_session *session) /* calculate sample callchain data array length */ n += len + 2; } + if (sample_type & PERF_SAMPLE_CGROUP) + data.array[n++] = key.cgroup_id; /* TODO: handle more sample types */ size = n * sizeof(u64); diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c index 986d7db6e75d..792ae2847080 100644 --- a/tools/perf/util/bpf_skel/off_cpu.bpf.c +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -26,6 +26,7 @@ struct offcpu_key { __u32 tgid; __u32 stack_id; __u32 state; + __u64 cgroup_id; }; struct { @@ -63,6 +64,13 @@ struct { __uint(max_entries, 1); } task_filter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cgroup_filter SEC(".maps"); + /* old kernel task_struct definition */ struct task_struct___old { long state; @@ -71,8 +79,11 @@ struct task_struct___old { int enabled = 0; int has_cpu = 0; int has_task = 0; +int has_cgroup = 0; const volatile bool has_prev_state = false; +const volatile bool needs_cgroup = false; +const volatile bool uses_cgroup_v1 = false; /* * Old kernel used to call it 
task_struct->state and now it's '__state'. @@ -92,6 +103,18 @@ static inline int get_task_state(struct task_struct *t) return BPF_CORE_READ(t_old, state); } +static inline __u64 get_cgroup_id(struct task_struct *t) +{ + struct cgroup *cgrp; + + if (uses_cgroup_v1) + cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup); + else + cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp); + + return BPF_CORE_READ(cgrp, kn, id); +} + static inline int can_record(struct task_struct *t, int state) { /* kernel threads don't have user stack */ @@ -120,6 +143,15 @@ static inline int can_record(struct task_struct *t, int state) return 0; } + if (has_cgroup) { + __u8 *ok; + __u64 cgrp_id = get_cgroup_id(t); + + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); + if (!ok) + return 0; + } + return 1; } @@ -156,6 +188,7 @@ next: .tgid = next->tgid, .stack_id = pelem->stack_id, .state = pelem->state, + .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, }; __u64 delta = ts - pelem->timestamp; __u64 *total; diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h index f47af0232e55..548008f74d42 100644 --- a/tools/perf/util/off_cpu.h +++ b/tools/perf/util/off_cpu.h @@ -4,15 +4,18 @@ struct evlist; struct target; struct perf_session; +struct record_opts; #define OFFCPU_EVENT "offcpu-time" #ifdef HAVE_BPF_SKEL -int off_cpu_prepare(struct evlist *evlist, struct target *target); +int off_cpu_prepare(struct evlist *evlist, struct target *target, + struct record_opts *opts); int off_cpu_write(struct perf_session *session); #else static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused, - struct target *target __maybe_unused) + struct target *target __maybe_unused, + struct record_opts *opts __maybe_unused) { return -1; } |