From 6fe7c745f2acb73e4cc961d7f91125eef5a8861f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 10 Aug 2021 11:07:14 +0900
Subject: tracing/boot: Fix a hist trigger dependency for boot time tracing

Fixes a build error when CONFIG_HIST_TRIGGERS=n with boot-time tracing.
Since trigger_process_regex() is defined only when CONFIG_HIST_TRIGGERS=y,
if it is disabled, the 'actions' event option must also be disabled.

Link: https://lkml.kernel.org/r/162856123376.203126.582144262622247352.stgit@devnote2

Fixes: 81a59555ff15 ("tracing/boot: Add per-event settings")
Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_boot.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 94ef2d099e32..d713714cba67 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -205,12 +205,15 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 			pr_err("Failed to apply filter: %s\n", buf);
 	}
 
-	xbc_node_for_each_array_value(enode, "actions", anode, p) {
-		if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
-			pr_err("action string is too long: %s\n", p);
-		else if (trigger_process_regex(file, buf) < 0)
-			pr_err("Failed to apply an action: %s\n", buf);
-	}
+	if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) {
+		xbc_node_for_each_array_value(enode, "actions", anode, p) {
+			if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+				pr_err("action string is too long: %s\n", p);
+			else if (trigger_process_regex(file, buf) < 0)
+				pr_err("Failed to apply an action: %s\n", buf);
+		}
+	} else if (xbc_node_find_value(enode, "actions", NULL))
+		pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
 
 	if (xbc_node_find_value(enode, "enable", NULL)) {
 		if (trace_event_enable_disable(file, 1, 0) < 0)
-- cgit v1.2.3

From de9a48a360b70d5318061cf1237431d1869555e4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Wed, 7 Jul 2021 17:36:24 -0400
Subject: tracing: Add linear buckets to histogram logic

There have been several times I wished the histogram logic had a
"grouping" feature for the buckets. Currently, each bucket has a size of
one. That is, if you trace the amount of requested allocations, each
allocation is its own bucket, even if you are interested in what
allocates 100 bytes or less, 100 to 200, 200 to 300, etc.

Also, without grouping, it fills up the allocated histogram buckets
quickly. If you are tracking latency and don't care whether something is
200 or 201 microseconds off, but want to track the events in groups of,
say, 10 microseconds each, that cannot currently be done. There is a log2
modifier, but that grouping gets too big too fast for a lot of cases.

Introduce a "buckets=SIZE" command to each field where it will record in a
rounded number.
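The rounding is plain integer arithmetic: a value v with bucket size B is
recorded as (v / B) * B, i.e. the lower bound of its bucket. A minimal
userspace C sketch of that grouping (bucket_round() is an illustrative
name, not a kernel function):

	#include <stdio.h>

	/* Group a value into its bucket's lower bound. */
	static unsigned long long bucket_round(unsigned long long val,
					       unsigned long buckets)
	{
		return (val / buckets) * buckets;
	}

	int main(void)
	{
		/* A 157-byte request with buckets=100 lands in 100-199 */
		printf("%llu\n", bucket_round(157, 100)); /* prints 100 */
		return 0;
	}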
For example:

 ># echo 'hist:keys=bytes_req.buckets=100:sort=bytes_req' > events/kmem/kmalloc/trigger
 ># cat events/kmem/kmalloc/hist
 # event histogram
 #
 # trigger info: hist:keys=bytes_req.buckets=100:vals=hitcount:sort=bytes_req.buckets=100:size=2048 [active]
 #

 { bytes_req: ~ 0-99 }      hitcount:   3149
 { bytes_req: ~ 100-199 }   hitcount:   1468
 { bytes_req: ~ 200-299 }   hitcount:     39
 { bytes_req: ~ 300-399 }   hitcount:    306
 { bytes_req: ~ 400-499 }   hitcount:    364
 { bytes_req: ~ 500-599 }   hitcount:     32
 { bytes_req: ~ 600-699 }   hitcount:     69
 { bytes_req: ~ 700-799 }   hitcount:     37
 { bytes_req: ~ 1200-1299 } hitcount:     16
 { bytes_req: ~ 1400-1499 } hitcount:     30
 { bytes_req: ~ 2000-2099 } hitcount:      6
 { bytes_req: ~ 4000-4099 } hitcount:   2168
 { bytes_req: ~ 5000-5099 } hitcount:      6

 Totals:
     Hits: 7690
     Entries: 13
     Dropped: 0

Link: https://lkml.kernel.org/r/20210707213921.980359719@goodmis.org

Acked-by: Namhyung Kim
Reviewed-by: Tom Zanussi
Reviewed-by: Masami Hiramatsu
Tested-by: Daniel Bristot de Oliveira
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_events_hist.c | 65 +++++++++++++++++++++++++++++++++++----
 1 file changed, 58 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a48aa2a2875b..8e87c4a429fd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -121,6 +121,7 @@ struct hist_field {
 	unsigned int size;
 	unsigned int offset;
 	unsigned int is_signed;
+	unsigned long buckets;
 	const char *type;
 	struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
 	struct hist_trigger_data *hist_data;
@@ -219,6 +220,27 @@ static u64 hist_field_log2(struct hist_field *hist_field,
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
 
+static u64 hist_field_bucket(struct hist_field *hist_field,
+			     struct tracing_map_elt *elt,
+			     struct trace_buffer *buffer,
+			     struct ring_buffer_event *rbe,
+			     void *event)
+{
+	struct hist_field *operand = hist_field->operands[0];
+	unsigned long buckets = hist_field->buckets;
+
+	u64 val = operand->fn(operand, elt, buffer, rbe, event);
+
+	if (WARN_ON_ONCE(!buckets))
+		return val;
+
+	if (val >= LONG_MAX)
+		val = div64_ul(val, buckets);
+	else
+		val = (u64)((unsigned long)val / buckets);
+	return val * buckets;
+}
+
 static u64 hist_field_plus(struct hist_field *hist_field,
 			   struct tracing_map_elt *elt,
 			   struct trace_buffer *buffer,
@@ -318,6 +340,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_VAR_REF = 1 << 14,
 	HIST_FIELD_FL_CPU = 1 << 15,
 	HIST_FIELD_FL_ALIAS = 1 << 16,
+	HIST_FIELD_FL_BUCKET = 1 << 17,
 };
 
 struct var_defs {
@@ -1109,7 +1132,8 @@ static const char *hist_field_name(struct hist_field *field,
 	if (field->field)
 		field_name = field->field->name;
 	else if (field->flags & HIST_FIELD_FL_LOG2 ||
-		 field->flags & HIST_FIELD_FL_ALIAS)
+		 field->flags & HIST_FIELD_FL_ALIAS ||
+		 field->flags & HIST_FIELD_FL_BUCKET)
 		field_name = hist_field_name(field->operands[0], ++level);
 	else if (field->flags & HIST_FIELD_FL_CPU)
 		field_name = "common_cpu";
@@ -1470,6 +1494,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "syscall";
 	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
 		flags_str = "log2";
+	else if (hist_field->flags & HIST_FIELD_FL_BUCKET)
+		flags_str = "buckets";
 	else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
 		flags_str = "usecs";
 
@@ -1658,9 +1684,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 		goto out;
 	}
 
-	if (flags & HIST_FIELD_FL_LOG2) {
-		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
-		hist_field->fn = hist_field_log2;
+	if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
+		unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
+		hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
+			hist_field_bucket;
 		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
 		hist_field->size = hist_field->operands[0]->size;
 		hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1953,7 +1980,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
 
 static struct ftrace_event_field *
 parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
-	    char *field_str, unsigned long *flags)
+	    char *field_str, unsigned long *flags, unsigned long *buckets)
 {
 	struct ftrace_event_field *field = NULL;
 	char *field_name, *modifier, *str;
@@ -1980,7 +2007,22 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
 		*flags |= HIST_FIELD_FL_LOG2;
 	else if (strcmp(modifier, "usecs") == 0)
 		*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
-	else {
+	else if (strncmp(modifier, "bucket", 6) == 0) {
+		int ret;
+
+		modifier += 6;
+
+		if (*modifier == 's')
+			modifier++;
+		if (*modifier != '=')
+			goto error;
+		modifier++;
+		ret = kstrtoul(modifier, 0, buckets);
+		if (ret || !(*buckets))
+			goto error;
+		*flags |= HIST_FIELD_FL_BUCKET;
+	} else {
+ error:
 		hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
 		field = ERR_PTR(-EINVAL);
 		goto out;
@@ -2049,6 +2091,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
 	char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
 	struct ftrace_event_field *field = NULL;
 	struct hist_field *hist_field = NULL;
+	unsigned long buckets = 0;
 	int ret = 0;
 
 	s = strchr(str, '.');
@@ -2086,7 +2129,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
 	} else
 		str = s;
 
-	field = parse_field(hist_data, file, str, flags);
+	field = parse_field(hist_data, file, str, flags, &buckets);
 	if (IS_ERR(field)) {
 		ret = PTR_ERR(field);
 		goto out;
@@ -2097,6 +2140,7 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
 		ret = -ENOMEM;
 		goto out;
 	}
+	hist_field->buckets = buckets;
 
 	return hist_field;
 out:
@@ -4698,6 +4742,11 @@ static void hist_trigger_print_key(struct seq_file *m,
 	} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
 		seq_printf(m, "%s: ~ 2^%-2llu", field_name,
 			   *(u64 *)(key + key_field->offset));
+	} else if (key_field->flags & HIST_FIELD_FL_BUCKET) {
+		unsigned long buckets = key_field->buckets;
+		uval = *(u64 *)(key + key_field->offset);
+		seq_printf(m, "%s: ~ %llu-%llu", field_name,
+			   uval, uval + buckets -1);
 	} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 		seq_printf(m, "%s: %-50s", field_name,
 			   (char *)(key + key_field->offset));
@@ -5137,6 +5186,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 			seq_printf(m, ".%s", flags);
 		}
 	}
+	if (hist_field->buckets)
+		seq_printf(m, "=%ld", hist_field->buckets);
 }
 
 static int event_hist_trigger_print(struct seq_file *m,
-- cgit v1.2.3

From 370364351926e4fcc7c1a486901bfaae0172b7d9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Wed, 7 Jul 2021 17:36:25 -0400
Subject: tracing/histogram: Update the documentation for the buckets modifier

Update both the tracefs README file and histogram.rst to include an
explanation of what the buckets modifier is and how to use it.
Add a wakeup_latency example for both the log2 and the buckets modifiers,
as there was no existing log2 example.

Link: https://lkml.kernel.org/r/20210707213922.167218794@goodmis.org

Acked-by: Namhyung Kim
Reviewed-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a1adb29ef5c1..be0169594de5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5654,6 +5654,7 @@ static const char readme_msg[] =
 	"\t .execname display a common_pid as a program name\n"
 	"\t .syscall display a syscall id as a syscall name\n"
 	"\t .log2 display log2 value rather than raw number\n"
+	"\t .buckets=size display values in groups of size rather than raw number\n"
 	"\t .usecs display a common_timestamp in microseconds\n\n"
 	"\t The 'pause' parameter can be used to pause an existing hist\n"
 	"\t trigger or to start a hist trigger but not log any events\n"
-- cgit v1.2.3

From 3347d80baa41c357cf263923f60aa8051a753d76 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 22 Jul 2021 10:27:06 -0400
Subject: tracing: Have histogram types be constant when possible

Instead of kstrdup("const", GFP_KERNEL), have the hist_field type simply
assign the constant:

	hist_field->type = "const";

And when the value passed to it is a variable, use:

	kstrdup_const(var, GFP_KERNEL);

which will just copy the value if the variable is already a constant. This
saves on having to allocate when not needed. All frees of the
hist_field->type will need to use kfree_const().

Link: https://lkml.kernel.org/r/20210722142837.280718447@goodmis.org

Suggested-by: Masami Hiramatsu
Reviewed-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_events_hist.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 8e87c4a429fd..bb466a82b938 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1616,7 +1616,9 @@ static void __destroy_hist_field(struct hist_field *hist_field)
 
 	kfree(hist_field->var.name);
 	kfree(hist_field->name);
-	kfree(hist_field->type);
+
+	/* Can likely be a const */
+	kfree_const(hist_field->type);
 
 	kfree(hist_field->system);
 	kfree(hist_field->event_name);
@@ -1673,9 +1675,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 	if (flags & HIST_FIELD_FL_HITCOUNT) {
 		hist_field->fn = hist_field_counter;
 		hist_field->size = sizeof(u64);
-		hist_field->type = kstrdup("u64", GFP_KERNEL);
-		if (!hist_field->type)
-			goto free;
+		hist_field->type = "u64";
 		goto out;
 	}
 
@@ -1690,7 +1690,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 			hist_field_bucket;
 		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
 		hist_field->size = hist_field->operands[0]->size;
-		hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+		hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
 		if (!hist_field->type)
 			goto free;
 		goto out;
@@ -1699,18 +1699,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 	if (flags & HIST_FIELD_FL_TIMESTAMP) {
 		hist_field->fn = hist_field_timestamp;
 		hist_field->size = sizeof(u64);
-		hist_field->type = kstrdup("u64", GFP_KERNEL);
-		if (!hist_field->type)
-			goto free;
+		hist_field->type = "u64";
 		goto out;
 	}
 
 	if (flags & HIST_FIELD_FL_CPU) {
 		hist_field->fn = hist_field_cpu;
 		hist_field->size = sizeof(int);
-		hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
-		if (!hist_field->type)
-			goto free;
+		hist_field->type = "unsigned int";
 		goto out;
 	}
 
@@ -1723,7 +1719,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 			flags |= HIST_FIELD_FL_STRING;
 
 		hist_field->size = MAX_FILTER_STR_VAL;
-		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
 		if (!hist_field->type)
 			goto free;
 
@@ -1736,7 +1732,7 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 	} else {
 		hist_field->size = field->size;
 		hist_field->is_signed = field->is_signed;
-		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		hist_field->type = kstrdup_const(field->type, GFP_KERNEL);
 		if (!hist_field->type)
 			goto free;
 
@@ -1822,7 +1818,7 @@ static int init_var_ref(struct hist_field *ref_field,
 		}
 	}
 
-	ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+	ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL);
 	if (!ref_field->type) {
 		err = -ENOMEM;
 		goto free;
@@ -2215,7 +2211,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
 	expr->operands[0] = operand1;
 	expr->operator = FIELD_OP_UNARY_MINUS;
 	expr->name = expr_str(expr, 0);
-	expr->type = kstrdup(operand1->type, GFP_KERNEL);
+	expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
 	if (!expr->type) {
 		ret = -ENOMEM;
 		goto free;
@@ -2355,7 +2351,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 	expr->operator = field_op;
 	expr->name = expr_str(expr, 0);
-	expr->type = kstrdup(operand1->type, GFP_KERNEL);
+	expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
 	if (!expr->type) {
 		ret = -ENOMEM;
 		goto free;
@@ -2743,10 +2739,10 @@ static struct hist_field *create_var(struct hist_trigger_data *hist_data,
 	var->var.hist_data = var->hist_data = hist_data;
 	var->size = size;
 	var->var.name = kstrdup(name, GFP_KERNEL);
-	var->type = kstrdup(type, GFP_KERNEL);
+	var->type = kstrdup_const(type, GFP_KERNEL);
 	if (!var->var.name || !var->type) {
+		kfree_const(var->type);
 		kfree(var->var.name);
-		kfree(var->type);
 		kfree(var);
 		var = ERR_PTR(-ENOMEM);
 	}
-- cgit v1.2.3

From ed2cf90735daf40ab8d938b4b6d3ca43c0f84466 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Thu, 22 Jul 2021 10:27:07 -0400
Subject: tracing: Allow execnames to be passed as args for synthetic events

Allow common_pid.execname to be saved in a variable in one histogram to be
passed to another histogram that can pass it as a parameter to a synthetic
event.

 ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs:arg2=common_pid.execname' \
	> events/sched/sched_waking/trigger

 ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events

 ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1,exec=$arg2'\
	':onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,$exec)' \
	> events/sched/sched_switch/trigger

The above is a wake up latency synthetic event setup that passes the
execname of the common_pid that woke the task to the scheduling of that
task, which triggers a synthetic event that passes the original execname
as a parameter to display it.
 ># echo 1 > events/synthetic/enable

 ># cat trace
     <idle>-0     [006] d..4   186.863801: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3
     <idle>-0     [000] d..4   186.863858: wakeup_lat: pid=163 delta=27 wake_comm=
     <idle>-0     [001] d..4   186.863903: wakeup_lat: pid=1307 delta=36 wake_comm=kworker/u16:4
     <idle>-0     [000] d..4   186.863927: wakeup_lat: pid=163 delta=5 wake_comm=
     <idle>-0     [006] d..4   186.863957: wakeup_lat: pid=1306 delta=24 wake_comm=kworker/u16:3
       sshd-1306  [006] d..4   186.864051: wakeup_lat: pid=61 delta=62 wake_comm=
     <idle>-0     [000] d..4   186.965030: wakeup_lat: pid=609 delta=18 wake_comm=
     <idle>-0     [006] d..4   186.987582: wakeup_lat: pid=1306 delta=65 wake_comm=kworker/u16:3
     <idle>-0     [000] d..4   186.987639: wakeup_lat: pid=163 delta=27 wake_comm=

Link: https://lkml.kernel.org/r/20210722142837.458596338@goodmis.org

Reviewed-by: Tom Zanussi
Reviewed-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_events_hist.c | 46 ++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index bb466a82b938..9d91b1c06957 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1420,17 +1420,17 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 	struct hist_trigger_data *hist_data = elt->map->private_data;
 	unsigned int size = TASK_COMM_LEN;
 	struct hist_elt_data *elt_data;
-	struct hist_field *key_field;
+	struct hist_field *hist_field;
 	unsigned int i, n_str;
 
 	elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
 	if (!elt_data)
 		return -ENOMEM;
 
-	for_each_hist_key_field(i, hist_data) {
-		key_field = hist_data->fields[i];
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
 
-		if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+		if (hist_field->flags & HIST_FIELD_FL_EXECNAME) {
 			elt_data->comm = kzalloc(size, GFP_KERNEL);
 			if (!elt_data->comm) {
 				kfree(elt_data);
@@ -3771,6 +3771,41 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 	return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
 }
 
+static const char *no_comm = "(no comm)";
+
+static u64 hist_field_execname(struct hist_field *hist_field,
+			       struct tracing_map_elt *elt,
+			       struct trace_buffer *buffer,
+			       struct ring_buffer_event *rbe,
+			       void *event)
+{
+	struct hist_elt_data *elt_data;
+
+	if (WARN_ON_ONCE(!elt))
+		return (u64)(unsigned long)no_comm;
+
+	elt_data = elt->private_data;
+
+	if (WARN_ON_ONCE(!elt_data->comm))
+		return (u64)(unsigned long)no_comm;
+
+	return (u64)(unsigned long)(elt_data->comm);
+}
+
+/* Convert a var that points to common_pid.execname to a string */
+static void update_var_execname(struct hist_field *hist_field)
+{
+	hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR |
+		HIST_FIELD_FL_EXECNAME;
+	hist_field->size = MAX_FILTER_STR_VAL;
+	hist_field->is_signed = 0;
+
+	kfree_const(hist_field->type);
+	hist_field->type = "char[]";
+
+	hist_field->fn = hist_field_execname;
+}
+
 static int create_var_field(struct hist_trigger_data *hist_data,
 			    unsigned int val_idx,
 			    struct trace_event_file *file,
@@ -3795,6 +3830,9 @@ static int create_var_field(struct hist_trigger_data *hist_data,
 
 	ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
 
+	if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)
+		update_var_execname(hist_data->fields[val_idx]);
+
 	if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
 		hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
-- cgit v1.2.3

From de32951b29be3d6cc7a92bfbf366f48a9f4c4407 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Sat, 31 Jul 2021 14:22:31 +0900
Subject: tracing: Simplify the Kconfig dependency of FTRACE

The entire FTRACE block is surrounded by 'if TRACING_SUPPORT' ... 'endif'.
Using 'depends on' is a simpler way to guard FTRACE.

Link: https://lkml.kernel.org/r/20210731052233.4703-1-masahiroy@kernel.org

Signed-off-by: Masahiro Yamada
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/Kconfig | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3ee23f4d437f..420ff4bc67fd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -135,10 +135,9 @@ config TRACING_SUPPORT
 	depends on STACKTRACE_SUPPORT
 	default y
 
-if TRACING_SUPPORT
-
 menuconfig FTRACE
 	bool "Tracers"
+	depends on TRACING_SUPPORT
 	default y if DEBUG_KERNEL
 	help
 	  Enable the kernel tracing infrastructure.
@@ -1037,6 +1036,3 @@ config HIST_TRIGGERS_DEBUG
 	  If unsure, say N.
 
 endif # FTRACE
-
-endif # TRACING_SUPPORT
-
-- cgit v1.2.3

From e66ed86ca6c52488249e95f7b3a6a3d7d6ab5e1e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 10 Aug 2021 11:07:21 +0900
Subject: tracing/boot: Add per-event histogram action options

Add a hist-trigger action syntax support to boot-time tracing. Currently,
boot-time tracing supports per-event actions as option strings. However,
for the histogram action, it has a special syntax and usually needs a long
action definition. To make it readable and fit the bootconfig syntax, this
introduces a new option for histograms.

Here are the histogram action options for boot-time tracing.

 ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist {
	keys = <KEY>[,...]
	values = <VAL>[,...]
	sort = <SORT-KEY>[,...]
	size = <ENTRIES>
	name = <HISTNAME>
	var { <VAR> = <EXPR> ... }
	pause|continue|clear
	onmax|onchange { var = <VAR>; <ACTION> [= <PARAM>] }
	onmatch { event = <EVENT>; <ACTION> [= <PARAM>] }
	filter = <FILTER>
 }

Where <ACTION> is one of below;

	trace = <EVENT>, <ARG1>[, ...]
	save = <ARG1>[, ...]
	snapshot

Link: https://lkml.kernel.org/r/162856124106.203126.10501871028479029087.stgit@devnote2

Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_boot.c | 231 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d713714cba67..3d0e51368f51 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -171,6 +171,231 @@ trace_boot_add_synth_event(struct xbc_node *node, const char *event)
 }
 #endif
 
+#ifdef CONFIG_HIST_TRIGGERS
+static int __init __printf(3, 4)
+append_printf(char **bufp, char *end, const char *fmt, ...)
+{
+	va_list args;
+	int ret;
+
+	if (*bufp == end)
+		return -ENOSPC;
+
+	va_start(args, fmt);
+	ret = vsnprintf(*bufp, end - *bufp, fmt, args);
+	if (ret < end - *bufp) {
+		*bufp += ret;
+	} else {
+		*bufp = end;
+		ret = -ERANGE;
+	}
+	va_end(args);
+
+	return ret;
+}
+
+static int __init
+append_str_nospace(char **bufp, char *end, const char *str)
+{
+	char *p = *bufp;
+	int len;
+
+	while (p < end - 1 && *str != '\0') {
+		if (!isspace(*str))
+			*(p++) = *str;
+		str++;
+	}
+	*p = '\0';
+	if (p == end - 1) {
+		*bufp = end;
+		return -ENOSPC;
+	}
+	len = p - *bufp;
+	*bufp = p;
+	return (int)len;
+}
+
+static int __init
+trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
+			  char *end, const char *key)
+{
+	struct xbc_node *knode, *anode;
+	const char *p;
+	char sep;
+
+	knode = xbc_node_find_child(hnode, key);
+	if (knode) {
+		anode = xbc_node_get_child(knode);
+		if (!anode) {
+			pr_err("hist.%s requires value(s).\n", key);
+			return -EINVAL;
+		}
+
+		append_printf(bufp, end, ":%s", key);
+		sep = '=';
+		xbc_array_for_each_value(anode, p) {
+			append_printf(bufp, end, "%c%s", sep, p);
+			if (sep == '=')
+				sep = ',';
+		}
+	} else
+		return -ENOENT;
+
+	return 0;
+}
+
+static int __init
+trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp,
+			    char *end, const char *param)
+{
+	struct xbc_node *knode, *anode;
+	const char *p;
+	char sep;
+
+	/* Compose 'handler' parameter */
+	p = xbc_node_find_value(hnode, param, NULL);
+	if (!p) {
+		pr_err("hist.%s requires '%s' option.\n",
+		       xbc_node_get_data(hnode), param);
+		return -EINVAL;
+	}
+	append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p);
+
+	/* Compose 'action' parameter */
+	knode = xbc_node_find_child(hnode, "trace");
+	if (!knode)
+		knode = xbc_node_find_child(hnode, "save");
+
+	if (knode) {
+		anode = xbc_node_get_child(knode);
+		if (!anode || !xbc_node_is_value(anode)) {
+			pr_err("hist.%s.%s requires value(s).\n",
+			       xbc_node_get_data(hnode),
+			       xbc_node_get_data(knode));
+			return -EINVAL;
+		}
+
+		append_printf(bufp, end, ".%s", xbc_node_get_data(knode));
+		sep = '(';
+		xbc_array_for_each_value(anode, p) {
+			append_printf(bufp, end, "%c%s", sep, p);
+			if (sep == '(')
+				sep = ',';
+		}
+		append_printf(bufp, end, ")");
+	} else if (xbc_node_find_child(hnode, "snapshot")) {
+		append_printf(bufp, end, ".snapshot()");
+	} else {
+		pr_err("hist.%s requires an action.\n",
+		       xbc_node_get_data(hnode));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Histogram boottime tracing syntax.
+ *
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist {
+ *	keys = <KEY>[,...]
+ *	values = <VAL>[,...]
+ *	sort = <SORT-KEY>[,...]
+ *	size = <ENTRIES>
+ *	name = <HISTNAME>
+ *	var { <VAR> = <EXPR> ... }
+ *	pause|continue|clear
+ *	onmax|onchange { var = <VAR>; <ACTION> [= <PARAM>] }
+ *	onmatch { event = <EVENT>; <ACTION> [= <PARAM>] }
+ *	filter = <FILTER>
+ * }
+ *
+ * Where <ACTION> is one of below;
+ *
+ *	trace = <EVENT>, <ARG1>[, ...]
+ *	save = <ARG1>[, ...]
+ *	snapshot
+ */
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+	struct xbc_node *node, *knode;
+	char *end = buf + size;
+	const char *p;
+	int ret = 0;
+
+	append_printf(&buf, end, "hist");
+
+	ret = trace_boot_hist_add_array(hnode, &buf, end, "keys");
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			pr_err("hist requires keys.\n");
+		return -EINVAL;
+	}
+
+	ret = trace_boot_hist_add_array(hnode, &buf, end, "values");
+	if (ret == -EINVAL)
+		return ret;
+	ret = trace_boot_hist_add_array(hnode, &buf, end, "sort");
+	if (ret == -EINVAL)
+		return ret;
+
+	p = xbc_node_find_value(hnode, "size", NULL);
+	if (p)
+		append_printf(&buf, end, ":size=%s", p);
+
+	p = xbc_node_find_value(hnode, "name", NULL);
+	if (p)
+		append_printf(&buf, end, ":name=%s", p);
+
+	node = xbc_node_find_child(hnode, "var");
+	if (node) {
+		xbc_node_for_each_key_value(node, knode, p) {
+			/* Expression must not include spaces. */
+			append_printf(&buf, end, ":%s=",
+				      xbc_node_get_data(knode));
+			append_str_nospace(&buf, end, p);
+		}
+	}
+
+	/* Histogram control attributes (mutual exclusive) */
+	if (xbc_node_find_child(hnode, "pause"))
+		append_printf(&buf, end, ":pause");
+	else if (xbc_node_find_child(hnode, "continue"))
+		append_printf(&buf, end, ":continue");
+	else if (xbc_node_find_child(hnode, "clear"))
+		append_printf(&buf, end, ":clear");
+
+	/* Histogram handler and actions */
+	node = xbc_node_find_child(hnode, "onmax");
+	if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0)
+		return -EINVAL;
+	node = xbc_node_find_child(hnode, "onchange");
+	if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0)
+		return -EINVAL;
+	node = xbc_node_find_child(hnode, "onmatch");
+	if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0)
+		return -EINVAL;
+
+	p = xbc_node_find_value(hnode, "filter", NULL);
+	if (p)
+		append_printf(&buf, end, " if %s", p);
+
+	if (buf == end) {
+		pr_err("hist exceeds the max command length.\n");
+		return -E2BIG;
+	}
+
+	return 0;
+}
+#else
+static int __init
+trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 static void __init
 trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 			  struct xbc_node *enode)
@@ -212,6 +437,12 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 			else if (trigger_process_regex(file, buf) < 0)
 				pr_err("Failed to apply an action: %s\n", buf);
 		}
+		anode = xbc_node_find_child(enode, "hist");
+		if (anode &&
+		    trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) {
+			if (trigger_process_regex(file, buf) < 0)
+				pr_err("Failed to apply hist trigger: %s\n", buf);
+		}
 	} else if (xbc_node_find_value(enode, "actions", NULL))
 		pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
-- cgit v1.2.3

From 8993665abcce793f00815b3504a486dce70cc2b9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 10 Aug 2021 11:07:29 +0900
Subject: tracing/boot: Support multiple handlers for per-event histogram

Support multiple handlers for per-event histogram in boot-time tracing.
Since the histogram can register multiple same handler-actions with
different parameters, this expands the syntax to support such cases. With
this update, the 'onmax', 'onchange' and 'onmatch' handler subkeys under
the per-event histogram option will optionally take number subkeys, as
below.
(see [.N])

 ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist {
	onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
	onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
 }

The 'N' must be a digit (or a word starting with a digit). Thus the user
can add several handler-actions to the histogram. For example,

 ftrace.event.SOMEGROUP.SOMEEVENT.hist {
	keys = SOME_ID; lat = common_timestamp.usecs-$ts0
	onmatch.1 {
		event = GROUP1.STARTEVENT1
		trace = latency_event, SOME_ID, $lat
	}
	onmatch.2 {
		event = GROUP2.STARTEVENT2
		trace = latency_event, SOME_ID, $lat
	}
 }

Then, it can trace the elapsed time from GROUP1.STARTEVENT1 to
SOMEGROUP.SOMEEVENT, and from GROUP2.STARTEVENT2 to SOMEGROUP.SOMEEVENT,
with the SOME_ID key.

Link: https://lkml.kernel.org/r/162856124905.203126.14913731908137885922.stgit@devnote2

Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_boot.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3d0e51368f51..f024f27b3602 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -245,8 +245,9 @@ trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
 }
 
 static int __init
-trace_boot_hist_add_handler(struct xbc_node *hnode, char **bufp,
-			    char *end, const char *param)
+trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
+				char *end, const char *handler,
+				const char *param)
 {
 	struct xbc_node *knode, *anode;
 	const char *p;
 	char sep;
@@ -259,7 +260,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
 		       xbc_node_get_data(hnode), param);
 		return -EINVAL;
 	}
-	append_printf(bufp, end, ":%s(%s)", xbc_node_get_data(hnode), p);
+	append_printf(bufp, end, ":%s(%s)", handler, p);
 
 	/* Compose 'action' parameter */
 	knode = xbc_node_find_child(hnode, "trace");
@@ -294,6 +295,32 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
 	return 0;
 }
 
+static int __init
+trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
+			     char *end, const char *param)
+{
+	struct xbc_node *node;
+	const char *p, *handler;
+	int ret;
+
+	handler = xbc_node_get_data(hnode);
+
+	xbc_node_for_each_subkey(hnode, node) {
+		p = xbc_node_get_data(node);
+		if (!isdigit(p[0]))
+			continue;
+		/* All digit started node should be instances. */
+		ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param);
+		if (ret < 0)
+			break;
+	}
+
+	if (xbc_node_find_child(hnode, param))
+		ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
+
+	return ret;
+}
+
 /*
  * Histogram boottime tracing syntax.
  *
  * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist {
 *	keys = <KEY>[,...]
 *	values = <VAL>[,...]
 *	sort = <SORT-KEY>[,...]
 *	size = <ENTRIES>
 *	name = <HISTNAME>
 *	var { <VAR> = <EXPR> ... }
 *	pause|continue|clear
- *	onmax|onchange { var = <VAR>; <ACTION> [= <PARAM>] }
- *	onmatch { event = <EVENT>; <ACTION> [= <PARAM>] }
+ *	onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] }
+ *	onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] }
 *	filter = <FILTER>
 * }
 *
@@ -368,13 +395,13 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
 
 	/* Histogram handler and actions */
 	node = xbc_node_find_child(hnode, "onmax");
-	if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0)
+	if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
 		return -EINVAL;
 	node = xbc_node_find_child(hnode, "onchange");
-	if (node && trace_boot_hist_add_handler(node, &buf, end, "var") < 0)
+	if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
 		return -EINVAL;
 	node = xbc_node_find_child(hnode, "onmatch");
-	if (node && trace_boot_hist_add_handler(node, &buf, end, "event") < 0)
+	if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
 		return -EINVAL;
 
 	p = xbc_node_find_value(hnode, "filter", NULL);
-- cgit v1.2.3

From 17abd7c36c77c393fa65cde462059395d6437dba Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 10 Aug 2021 11:07:36 +0900
Subject: tracing/boot: Support multiple histograms for each event

Add support for multiple histograms for each event. This allows the user
to set multiple histograms on one event.

 ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { ... }

The 'N' is a string starting with a digit, and it can be omitted for the
default histogram.

For example, the multiple hist triggers example in
Documentation/trace/histogram.rst can be written as below;

 ftrace.event.net.netif_receive_skb.hist {
	1 { keys = skbaddr.hex; values = len; filter = len < 0 }
	2 { keys = skbaddr.hex; values = len; filter = len > 4096 }
	3 { keys = skbaddr.hex; values = len; filter = len == 256 }
	4 { keys = skbaddr.hex; values = len }
	5 { keys = len; values = common_preempt_count }
 }

Link: https://lkml.kernel.org/r/162856125628.203126.15846930277378572120.stgit@devnote2

Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_boot.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index f024f27b3602..1a2b270e9cda 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -324,7 +324,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
 /*
  * Histogram boottime tracing syntax.
  *
- * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist {
+ * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] {
 *	keys = <KEY>[,...]
 *	values = <VAL>[,...]
 *	sort = <SORT-KEY>[,...]
@@ -415,11 +415,37 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
 	return 0;
 }
 
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+			   struct xbc_node *hnode, char *buf, size_t size)
+{
+	struct xbc_node *node;
+	const char *p;
+
+	xbc_node_for_each_subkey(hnode, node) {
+		p = xbc_node_get_data(node);
+		if (!isdigit(p[0]))
+			continue;
+		/* All digit started node should be instances. */
+		if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+			if (trigger_process_regex(file, buf) < 0)
+				pr_err("Failed to apply hist trigger: %s\n", buf);
+		}
+	}
+
+	if (xbc_node_find_child(hnode, "keys")) {
+		if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0)
+			if (trigger_process_regex(file, buf) < 0)
+				pr_err("Failed to apply hist trigger: %s\n", buf);
+	}
+}
 #else
-static int __init
-trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
+static void __init
+trace_boot_init_histograms(struct trace_event_file *file,
+			   struct xbc_node *hnode, char *buf, size_t size)
 {
-	return -EOPNOTSUPP;
+	/* do nothing */
 }
 #endif
 
@@ -465,11 +491,8 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 				pr_err("Failed to apply an action: %s\n", buf);
 		}
 		anode = xbc_node_find_child(enode, "hist");
-		if (anode &&
-		    trace_boot_compose_hist_cmd(anode, buf, ARRAY_SIZE(buf)) == 0) {
-			if (trigger_process_regex(file, buf) < 0)
-				pr_err("Failed to apply hist trigger: %s\n", buf);
-		}
+		if (anode)
+			trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
 	} else if (xbc_node_find_value(enode, "actions", NULL))
 		pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n");
-- cgit v1.2.3

From 64dc7f6958ef56512137ed6ec228127cef7724e9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Tue, 10 Aug 2021 11:07:44 +0900
Subject: tracing/boot: Show correct histogram error command

Since trigger_process_regex() modifies the given trigger actions while
parsing, the error message could not show what command was passed to
trigger_process_regex() when it returns an error. To fix that, show the
backed-up trigger action command instead of the parsed buffer.

Link: https://lkml.kernel.org/r/162856126413.203126.9465564928450701424.stgit@devnote2

Signed-off-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_boot.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 1a2b270e9cda..1060b0446032 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -422,6 +422,7 @@ trace_boot_init_histograms(struct trace_event_file *file,
 {
 	struct xbc_node *node;
 	const char *p;
+	char *tmp;
 
 	xbc_node_for_each_subkey(hnode, node) {
 		p = xbc_node_get_data(node);
@@ -429,15 +430,20 @@ trace_boot_init_histograms(struct trace_event_file *file,
 			continue;
 		/* All digit started node should be instances. */
 		if (trace_boot_compose_hist_cmd(node, buf, size) == 0) {
+			tmp = kstrdup(buf, GFP_KERNEL);
 			if (trigger_process_regex(file, buf) < 0)
-				pr_err("Failed to apply hist trigger: %s\n", buf);
+				pr_err("Failed to apply hist trigger: %s\n", tmp);
+			kfree(tmp);
 		}
 	}
 
 	if (xbc_node_find_child(hnode, "keys")) {
-		if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0)
+		if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
+			tmp = kstrdup(buf, GFP_KERNEL);
 			if (trigger_process_regex(file, buf) < 0)
-				pr_err("Failed to apply hist trigger: %s\n", buf);
+				pr_err("Failed to apply hist trigger: %s\n", tmp);
+			kfree(tmp);
+		}
 	}
 }
 #else
@@ -488,7 +494,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
 			if (strlcpy(buf, p, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
 				pr_err("action string is too long: %s\n", p);
 			else if (trigger_process_regex(file, buf) < 0)
-				pr_err("Failed to apply an action: %s\n", buf);
+				pr_err("Failed to apply an action: %s\n", p);
 		}
 		anode = xbc_node_find_child(enode, "hist");
 		if (anode)
-- cgit v1.2.3

From 99c37d1a63eafcd3673302a7953df760b46d0f6f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 3 Aug 2021 16:16:19 +0200
Subject: tracing: Replace deprecated CPU-hotplug functions.

The functions get_online_cpus() and put_online_cpus() have been deprecated
during the CPU hotplug rework. They map directly to cpus_read_lock() and
cpus_read_unlock().

Replace deprecated CPU-hotplug functions with the official version. The
behavior remains unchanged.

Link: https://lkml.kernel.org/r/20210803141621.780504-37-bigeasy@linutronix.de

Cc: Peter Zijlstra
Cc: Ingo Molnar
Acked-by: Daniel Bristot de Oliveira
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/ring_buffer.c   |  8 ++++----
 kernel/trace/trace_hwlat.c   | 28 ++++++++++++++--------------
 kernel/trace/trace_osnoise.c | 16 ++++++++--------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e592d1df6f88..c5a3fbf19617 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2111,7 +2111,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 		}
 	}
 
-	get_online_cpus();
+	cpus_read_lock();
 	/*
 	 * Fire off all the required work handlers
 	 * We can't schedule on offline CPUs, but it's not necessary
@@ -2143,7 +2143,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			cpu_buffer->nr_pages_to_update = 0;
 		}
 
-		put_online_cpus();
+		cpus_read_unlock();
 	} else {
 		cpu_buffer = buffer->buffers[cpu_id];
@@ -2171,7 +2171,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 			goto out_err;
 		}
 
-		get_online_cpus();
+		cpus_read_lock();
 
 		/* Can't run something on an offline CPU. */
 		if (!cpu_online(cpu_id))
@@ -2183,7 +2183,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 		}
 		cpu_buffer->nr_pages_to_update = 0;
-		put_online_cpus();
+		cpus_read_unlock();
 	}
 
 out:
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 14f46aae1981..1b83d75eb103 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -325,10 +325,10 @@ static void move_to_next_cpu(void)
 	if (!cpumask_equal(current_mask, current->cpus_ptr))
 		goto change_mode;
 
-	get_online_cpus();
+	cpus_read_lock();
 	cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
 	next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
-	put_online_cpus();
+	cpus_read_unlock();
 
 	if (next_cpu >= nr_cpu_ids)
 		next_cpu = cpumask_first(current_mask);
@@ -398,7 +398,7 @@ static void stop_single_kthread(void)
 	struct hwlat_kthread_data *kdata = get_cpu_data();
 	struct task_struct *kthread;
 
-	get_online_cpus();
+	cpus_read_lock();
 	kthread = kdata->kthread;
 
 	if (!kthread)
@@ -408,7 +408,7 @@ static void stop_single_kthread(void)
 	kdata->kthread = NULL;
 
 out_put_cpus:
-	put_online_cpus();
+	cpus_read_unlock();
 }
 
@@ -425,14 +425,14 @@ static int start_single_kthread(struct trace_array *tr)
 	struct task_struct *kthread;
 	int next_cpu;
 
-	get_online_cpus();
+	cpus_read_lock();
 	if (kdata->kthread)
 		goto out_put_cpus;
 
 	kthread = kthread_create(kthread_fn, NULL, "hwlatd");
 	if (IS_ERR(kthread)) {
 		pr_err(BANNER "could not start sampling thread\n");
-		put_online_cpus();
+		cpus_read_unlock();
 		return -ENOMEM;
 	}
 
@@ -452,7 +452,7 @@ static int start_single_kthread(struct trace_array *tr)
 	wake_up_process(kthread);
 
out_put_cpus:
-	put_online_cpus();
+	cpus_read_unlock();
 	return 0;
 }
 
@@ -479,10 +479,10 @@ static void stop_per_cpu_kthreads(void)
 {
 	unsigned int cpu;
 
-	get_online_cpus();
+	cpus_read_lock();
 	for_each_online_cpu(cpu)
 		stop_cpu_kthread(cpu);
-	put_online_cpus();
+	cpus_read_unlock();
 }
 
 /*
@@ -515,7 +515,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
 
 	mutex_lock(&trace_types_lock);
 	mutex_lock(&hwlat_data.lock);
-	get_online_cpus();
+	cpus_read_lock();
 
 	if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
 		goto out_unlock;
@@ -526,7 +526,7 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
 		start_cpu_kthread(cpu);
 
out_unlock:
-	put_online_cpus();
+	cpus_read_unlock();
 	mutex_unlock(&hwlat_data.lock);
 	mutex_unlock(&trace_types_lock);
 }
@@ -582,7 +582,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 	unsigned int cpu;
 	int retval;
 
-	get_online_cpus();
+	cpus_read_lock();
 	/*
 	 * Run only on CPUs in which hwlat is allowed to run.
 	 */
@@ -596,12 +596,12 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 		if (retval)
 			goto out_error;
 	}
-	put_online_cpus();
+	cpus_read_unlock();
 
 	return 0;
 
out_error:
-	put_online_cpus();
+	cpus_read_unlock();
 	stop_per_cpu_kthreads();
 	return retval;
 }
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b61eefe5ccf5..65b08b8e5bf8 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1498,12 +1498,12 @@ static void stop_per_cpu_kthreads(void)
 {
 	int cpu;
 
-	get_online_cpus();
+	cpus_read_lock();
 
 	for_each_online_cpu(cpu)
 		stop_kthread(cpu);
 
-	put_online_cpus();
+	cpus_read_unlock();
 }
 
 /*
@@ -1551,7 +1551,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 	int retval;
 	int cpu;
 
-	get_online_cpus();
+	cpus_read_lock();
 	/*
 	 * Run only on CPUs in which trace and osnoise are allowed to run.
 	 */
@@ -1572,7 +1572,7 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
 		}
 	}
 
-	put_online_cpus();
+	cpus_read_unlock();
 
 	return 0;
 }
@@ -1590,7 +1590,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
 		goto out_unlock_trace;
 
 	mutex_lock(&interface_lock);
-	get_online_cpus();
+	cpus_read_lock();
 
 	if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
 		goto out_unlock;
@@ -1601,7 +1601,7 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
 	start_kthread(cpu);
 
out_unlock:
-	put_online_cpus();
+	cpus_read_unlock();
 	mutex_unlock(&interface_lock);
out_unlock_trace:
 	mutex_unlock(&trace_types_lock);
@@ -1743,11 +1743,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
 	/*
 	 * osnoise_cpumask is read by CPU hotplug operations.
 	 */
-	get_online_cpus();
+	cpus_read_lock();
 
 	cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new);
 
-	put_online_cpus();
+	cpus_read_unlock();
 	mutex_unlock(&interface_lock);
 
 	if (running)
-- cgit v1.2.3

From 8b0e6c744fef6462382041b30878c91f15069fc6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 16 Aug 2021 23:42:56 -0400
Subject: tracing: Add DYNAMIC flag for dynamic events

To differentiate between static and dynamic events, add a new flag DYNAMIC
to the event flags that all dynamic events have set. This allows
differentiating an attachment to a dynamic event from one to a static
event.

Static events have a mod pointer that references the module they were
created in (or NULL for the core kernel). This can be incremented when the
event has something attached to it. But no such mechanism exists for
dynamic events. This is dangerous, as a dynamic event may now disappear
without the "attachment" knowing that it no longer exists.

To enforce the dynamic flag, change dyn_event_add() to pass the event that
is being created such that it can set the DYNAMIC flag of the event. This
helps make sure that no location that creates a dynamic event misses
setting this flag.
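The intent can be sketched as standalone C (call_like and EVENT_FL_DYNAMIC
are illustrative stand-ins for struct trace_event_call and
TRACE_EVENT_FL_DYNAMIC, not the kernel definitions):

	#include <stdbool.h>

	#define EVENT_FL_DYNAMIC (1 << 0)	/* stand-in flag bit */

	struct call_like {
		unsigned int flags;
	};

	/* Every creation path for a dynamic event funnels through one
	 * helper, so the flag cannot be forgotten. */
	static void dyn_event_add_like(struct call_like *call)
	{
		call->flags |= EVENT_FL_DYNAMIC;
	}

	static bool is_dynamic(const struct call_like *call)
	{
		return call->flags & EVENT_FL_DYNAMIC;
	}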
Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/
Link: https://lkml.kernel.org/r/20210817035026.936958254@goodmis.org

Suggested-by: Masami Hiramatsu
Acked-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace_dynevent.h     | 4 +++-
 kernel/trace/trace_events_synth.c | 2 +-
 kernel/trace/trace_kprobe.c       | 4 ++--
 kernel/trace/trace_uprobe.c       | 4 ++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 7754936b57ee..936477a111d3 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -76,13 +76,15 @@ int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops)
 	return 0;
 }
 
-static inline int dyn_event_add(struct dyn_event *ev)
+static inline int dyn_event_add(struct dyn_event *ev,
+				struct trace_event_call *call)
 {
 	lockdep_assert_held(&event_mutex);
 
 	if (!ev || !ev->ops)
 		return -EINVAL;
 
+	call->flags |= TRACE_EVENT_FL_DYNAMIC;
 	list_add_tail(&ev->list, &dyn_event_list);
 	return 0;
 }
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 9315fc03e303..f4f5489e1e28 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1298,7 +1298,7 @@ static int __create_synth_event(const char *name, const char *raw_fields)
 	}
 	ret = register_synth_event(event);
 	if (!ret)
-		dyn_event_add(&event->devent);
+		dyn_event_add(&event->devent, &event->call);
 	else
 		free_synth_event(event);
out:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ea6178cb5e33..bfef43bfce37 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -618,7 +618,7 @@ static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to)
 	if (ret)
 		trace_probe_unlink(&tk->tp);
 	else
-		dyn_event_add(&tk->devent);
+		dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
 
 	return ret;
 }
@@ -661,7 +661,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
 	if (ret < 0)
 		unregister_kprobe_event(tk);
 	else
-		dyn_event_add(&tk->devent);
+		dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
 
end:
 	mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9b50869a5ddb..50eca53b8d22 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -455,7 +455,7 @@ static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to)
 	/* Append to existing event */
 	ret = trace_probe_append(&tu->tp, &to->tp);
 	if (!ret)
-		dyn_event_add(&tu->devent);
+		dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
 
 	return ret;
 }
@@ -518,7 +518,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 		goto end;
 	}
 
-	dyn_event_add(&tu->devent);
+	dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
 
end:
 	mutex_unlock(&event_mutex);
-- cgit v1.2.3

From 1d18538e6a09265003a0a94ca779d7a6127cb76c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 16 Aug 2021 23:42:57 -0400
Subject: tracing: Have dynamic events have a ref counter

As dynamic events are not created by modules, if something is attached to
one, calling "try_module_get()" on its "mod" field is not going to keep
the dynamic event from going away.

Since dynamic events do not need the "mod" pointer of the event structure,
make a union out of it in order to save memory (there's one structure for
each of the thousand+ events in the kernel), and have any event with the
DYNAMIC flag set use a ref counter instead.
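The shape of the change, sketched outside the kernel (event_like is an
illustrative stand-in for struct trace_event_call; the union reuses the
same storage for both cases, so no event grows):

	#include <stdatomic.h>

	struct event_like {
		unsigned int flags;	/* DYNAMIC bit selects the member */
		union {
			void *module;		/* static event: owning module */
			atomic_int refcnt;	/* dynamic event: attachment count */
		};
	};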
Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/
Link: https://lkml.kernel.org/r/20210817035027.174869074@goodmis.org

Suggested-by: Masami Hiramatsu
Acked-by: Masami Hiramatsu
Signed-off-by: Steven Rostedt (VMware)
---
 kernel/trace/trace.c                |  4 ++--
 kernel/trace/trace_dynevent.c       | 38 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_event_perf.c     |  6 +++---
 kernel/trace/trace_events.c         | 22 +++++++++++++--------
 kernel/trace/trace_events_synth.c   | 19 ++++++++++++-------
 kernel/trace/trace_events_trigger.c |  6 +++---
 kernel/trace/trace_kprobe.c         |  4 ++++
 kernel/trace/trace_uprobe.c         |  4 ++++
 8 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index be0169594de5..8425c3d70895 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3697,11 +3697,11 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str)
 		return false;
 
 	event = container_of(trace_event, struct trace_event_call, event);
-	if (!event->mod)
+	if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module)
 		return false;
 
 	/* Would rather have rodata, but this will suffice */
-	if (within_module_core(addr, event->mod))
+	if (within_module_core(addr, event->module))
 		return true;
 
 	return false;
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index e57cc0870892..1110112e55bd 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -13,11 +13,49 @@
 #include <linux/tracefs.h>
 
 #include "trace.h"
+#include "trace_output.h"	/* for trace_event_sem */
 #include "trace_dynevent.h"
 
 static DEFINE_MUTEX(dyn_event_ops_mutex);
 static LIST_HEAD(dyn_event_ops_list);
 
+bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
+{
+	struct trace_event_call *call;
+	bool ret = false;
+
+	if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC)))
+		return false;
+
+	down_read(&trace_event_sem);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (call == dyn_call) {
+			atomic_inc(&dyn_call->refcnt);
+			ret = true;
+		}
+	}
+	up_read(&trace_event_sem);
+	return ret;
+}
+
+void trace_event_dyn_put_ref(struct trace_event_call *call)
+{
+	if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC)))
+		return;
+
+	if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) {
+		atomic_set(&call->refcnt, 0);
+		return;
+	}
+
+	atomic_dec(&call->refcnt);
+}
+
+bool trace_event_dyn_busy(struct trace_event_call *call)
+{
+	return atomic_read(&call->refcnt) != 0;
+}
+
 int dyn_event_register(struct dyn_event_operations *ops)
 {
 	if (!ops || !ops->create || !ops->show || !ops->is_busy ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 03be4435d103..6aed10e2f7ce 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -177,7 +177,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
 	}
 }
out:
-	module_put(tp_event->mod);
+	trace_event_put_ref(tp_event);
 }
 
 static int perf_trace_event_open(struct perf_event *p_event)
@@ -224,10 +224,10 @@ int perf_trace_init(struct perf_event *p_event)
 	list_for_each_entry(tp_event, &ftrace_events, list) {
 		if (tp_event->event.type == event_id &&
 		    tp_event->class && tp_event->class->reg &&
-		    try_module_get(tp_event->mod)) {
+		    trace_event_try_get_ref(tp_event)) {
 			ret = perf_trace_event_init(tp_event, p_event);
 			if (ret)
-				module_put(tp_event->mod);
+				trace_event_put_ref(tp_event);
 			break;
 		}
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 80e96989770e..1349b6de5eeb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2525,7 +2525,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
 		return ret;
 
 	list_add(&call->list, &ftrace_events);
-	call->mod = mod;
+	if (call->flags & TRACE_EVENT_FL_DYNAMIC)
+		atomic_set(&call->refcnt, 0);
+	else
+		call->module = mod;
 
 	return 0;
 }
@@ -2839,7 +2842,9 @@ static void trace_module_remove_events(struct module *mod)
 
 	down_write(&trace_event_sem);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
-		if (call->mod == mod)
+		if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
+			continue;
+		if (call->module == mod)
 			__trace_remove_event_call(call);
 	}
 	up_write(&trace_event_sem);
@@ -2982,7 +2987,7 @@ struct trace_event_file *trace_get_event_file(const char *instance,
 	}
 
 	/* Don't let event modules unload while in use */
-	ret = try_module_get(file->event_call->mod);
+	ret = trace_event_try_get_ref(file->event_call);
 	if (!ret) {
 		trace_array_put(tr);
 		ret = -EBUSY;
@@ -3012,7 +3017,7 @@ EXPORT_SYMBOL_GPL(trace_get_event_file);
 void trace_put_event_file(struct trace_event_file *file)
 {
 	mutex_lock(&event_mutex);
-	module_put(file->event_call->mod);
+	trace_event_put_ref(file->event_call);
 	mutex_unlock(&event_mutex);
 
 	trace_array_put(file->tr);
@@ -3147,7 +3152,7 @@ static int free_probe_data(void *data)
 	if (!edata->ref) {
 		/* Remove the SOFT_MODE flag */
 		__ftrace_event_enable_disable(edata->file, 0, 1);
-		module_put(edata->file->event_call->mod);
+		trace_event_put_ref(edata->file->event_call);
 		kfree(edata);
 	}
 	return 0;
@@ -3280,7 +3285,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
 
 out_reg:
 	/* Don't let event modules unload while probe registered */
-	ret = try_module_get(file->event_call->mod);
+	ret = trace_event_try_get_ref(file->event_call);
 	if (!ret) {
 		ret = -EBUSY;
 		goto out_free;
@@ -3310,7 +3315,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
 out_disable:
 	__ftrace_event_enable_disable(file, 0, 1);
 out_put:
-	module_put(file->event_call->mod);
+	trace_event_put_ref(file->event_call);
 out_free:
 	kfree(data);
 	goto out;
@@ -3376,7 +3381,8 @@ void __trace_early_add_events(struct trace_array *tr)
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		/* Early boot up should not have any modules loaded */
-		if (WARN_ON_ONCE(call->mod))
+		if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) &&
+		    WARN_ON_ONCE(call->module))
 			continue;
 
 		ret = __trace_early_add_new_event(call, tr);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index f4f5489e1e28..d54094b7a9d7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1369,13 +1369,15 @@ static int destroy_synth_event(struct synth_event *se)
 	int ret;
 
 	if (se->ref)
-		ret = -EBUSY;
-	else {
-		ret = unregister_synth_event(se);
-		if (!ret) {
-			dyn_event_remove(&se->devent);
-			free_synth_event(se);
-		}
+		return -EBUSY;
+
+	if (trace_event_dyn_busy(&se->call))
+		return -EBUSY;
+
+	ret = unregister_synth_event(se);
+	if (!ret) {
+		dyn_event_remove(&se->devent);
+		free_synth_event(se);
 	}
 
 	return ret;
@@ -2102,6 +2104,9 @@ static int synth_event_release(struct dyn_event *ev)
 	if (event->ref)
 		return -EBUSY;
 
+	if (trace_event_dyn_busy(&event->call))
+		return -EBUSY;
+
 	ret = unregister_synth_event(event);
 	if (ret)
 		return ret;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cf84d0f6583a..6b11e335a62e 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1334,7 +1334,7 @@ void event_enable_trigger_free(struct event_trigger_ops *ops,
 	if (!data->ref) {
 		/* Remove the SOFT_MODE flag */
 		trace_event_enable_disable(enable_data->file, 0, 1);
-		module_put(enable_data->file->event_call->mod);
+		trace_event_put_ref(enable_data->file->event_call);
 		trigger_data_free(data);
 		kfree(enable_data);
 	}
@@ -1481,7 +1481,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
 
out_reg:
 	/* Don't let event modules unload while probe registered */
-	ret = try_module_get(event_enable_file->event_call->mod);
+	ret = trace_event_try_get_ref(event_enable_file->event_call);
 	if (!ret) {
 		ret = -EBUSY;
 		goto out_free;
@@ -1510,7 +1510,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_disable:
 	trace_event_enable_disable(event_enable_file, 0, 1);
out_put:
-	module_put(event_enable_file->event_call->mod);
+	trace_event_put_ref(event_enable_file->event_call);
out_free:
 	if (cmd_ops->set_filter)
 		cmd_ops->set_filter(NULL, trigger_data, NULL);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index bfef43bfce37..82c3b86013b2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -543,6 +543,10 @@ static int unregister_trace_kprobe(struct trace_kprobe *tk)
 	if (trace_probe_is_enabled(&tk->tp))
 		return -EBUSY;
 
+	/* If there's a reference to the dynamic event */
+	if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp)))
+		return -EBUSY;
+
 	/* Will fail if probe is being used by ftrace or perf */
 	if (unregister_kprobe_event(tk))
 		return -EBUSY;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 50eca53b8d22..1e2a92e7607d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -393,6 +393,10 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu)
 	if (trace_probe_has_sibling(&tu->tp))
 		goto unreg;
 
+	/* If there's a reference to the dynamic event */
+	if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp)))
+		return -EBUSY;
+
 	ret = unregister_uprobe_event(tu);
 	if (ret)
 		return ret;
-- cgit v1.2.3

From fcd9db51df8e219e3a61b14e9b8c5ee67d39d37c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)"
Date: Mon, 16 Aug 2021 23:42:58 -0400
Subject: tracing/probe: Have traceprobe_parse_probe_arg() take a const arg

The two places that call traceprobe_parse_probe_arg() allocate a temporary
buffer to copy argv[i] into, because argv[i] is constant and
traceprobe_parse_probe_arg() will modify it to do the parsing. These two
places allocate this buffer and then free it right after calling this
function, leaving the onus of this allocation to the caller.

As there's about to be a third user of this function that will have to do
the same thing, instead of having the caller allocate the temporary
buffer, simply move that allocation into traceprobe_parse_probe_arg()
itself, which will simplify the code of the callers.
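The refactor follows a common C pattern: a callee that needs writable
scratch space duplicates its const input instead of every caller doing so.
A hedged userspace sketch (do_parse() stands in for the real parsing body;
the kernel uses kstrdup()/kfree() rather than strdup()/free()):

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>

	/* Stand-in for the parsing body; it may scribble on its
	 * argument, which is why a private copy is needed. */
	static int do_parse(char *arg)
	{
		char *colon = strchr(arg, ':');

		if (colon)
			*colon = '\0';	/* destructive: fine, it's our copy */
		return 0;
	}

	static int parse_arg(const char *argv)
	{
		char *arg = strdup(argv);
		int ret;

		if (!arg)
			return -ENOMEM;
		ret = do_parse(arg);
		free(arg);
		return ret;
	}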
Link: https://lkml.kernel.org/r/20210817035027.385422828@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +-------- kernel/trace/trace_probe.c | 47 +++++++++++++++++++++++++++------------------ kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_uprobe.c | 9 +-------- 4 files changed, 31 insertions(+), 36 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 82c3b86013b2..ed1e3c2087ab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -873,15 +873,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); - kfree(tmp); + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], flags); if (ret) goto error; /* This can be -ENOMEM */ } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 15413ad7cef2..ef717b373443 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -540,26 +540,34 @@ static int __parse_bitfield_probe_arg(const char *bf, } /* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; char *t, *t2, *t3; + char *arg; int ret, len; + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + ret = -EINVAL; len = strlen(arg); if (len > MAX_ARGSTR_LEN) { trace_probe_log_err(offset, ARG_TOO_LONG); - return -EINVAL; + goto out; } else if (len == 0) { trace_probe_log_err(offset, NO_ARG_BODY); - return -EINVAL; + goto out; } + ret = -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); if (!parg->comm) - return -ENOMEM; + goto out; + ret = -EINVAL; t = strchr(arg, ':'); if (t) { *t = '\0'; @@ -571,22 +579,22 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, offset += t2 + strlen(t2) - arg; trace_probe_log_err(offset, ARRAY_NO_CLOSE); - return -EINVAL; + goto out; } else if (t3[1] != '\0') { trace_probe_log_err(offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - return -EINVAL; + goto out; } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(offset + t2 - arg, BAD_ARRAY_NUM); - return -EINVAL; + goto out; } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(offset + t2 - arg, ARRAY_TOO_BIG); - return -EINVAL; + goto out; } } } @@ -598,29 +606,30 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { /* The type of $comm must be "string", and not an array. */ if (parg->count || (t && strcmp(t, "string"))) - return -EINVAL; + goto out; parg->type = find_fetch_type("string"); } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_TYPE); - return -EINVAL; + goto out; } parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + ret = -ENOMEM; if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); if (!parg->fmt) - return -ENOMEM; + goto out; snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, parg->count); } code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) - return -ENOMEM; + goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], @@ -628,6 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (ret) goto fail; + ret = -EINVAL; /* Store operation */ if (!strcmp(parg->type->name, "string") || !strcmp(parg->type->name, "ustring")) { @@ -636,7 +646,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op != FETCH_OP_DATA) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || @@ -650,7 +659,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } } @@ -672,7 +680,6 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -687,6 +694,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, goto fail; } } + ret = -EINVAL; /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && @@ -694,13 +702,11 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, scode->op != FETCH_OP_ST_USTRING) { trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_STRING); - ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(offset, TOO_MANY_OPS); - ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -709,6 +715,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code++; code->op = FETCH_OP_END; + ret = 0; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -724,6 +731,8 @@ fail: kfree(code->data); } kfree(tmp); +out: + kfree(arg); return ret; } @@ -745,11 +754,11 @@ static int traceprobe_conflict_field_name(const char *name, return 0; } -int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, unsigned int flags) { struct probe_arg *parg = &tp->args[i]; - char *body; + const char *body; /* Increment count for freeing args in error case */ tp->nr_args++; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 227d518e5ba5..42aa084902fa 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -354,7 +354,7 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_MASK GENMASK(2, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, - char *arg, unsigned int flags); + const char *argv, unsigned int flags); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 1e2a92e7607d..93ff96541971 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -684,16 +684,9 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { - tmp = kstrdup(argv[i], GFP_KERNEL); - if (!tmp) { - ret = -ENOMEM; - goto error; - } - trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], is_return ? TPARG_FL_RETURN : 0); - kfree(tmp); if (ret) goto error; } -- cgit v1.2.3 From bc1b973455fd5d84dac4a094da44202f2d8a98ef Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:42:59 -0400 Subject: tracing/probes: Allow for dot delimiter as well as slash for system names Kprobe and uprobe events can add a "system" to the events that are created via the kprobe_events and uprobe_events files respectively. If they do not include a "system" in the name, then the default "kprobes" or "uprobes" is used. The current notation to specify a system for one of these probe events is to add a '/' delimiter in the name, where the content before the '/' will be the system to use, and the content after will be the event name. echo 'p:my_system/my_event' > kprobe_events But this is inconsistent with the way histogram triggers separate their system / event names. The histogram triggers use a '.' delimiter, which can be confusing. To allow this to be more consistent, as well as keep backward compatibility, allow the kprobe and uprobe events to denote a system name with either a '/' or a '.'. 
That is: echo 'p:my_system/my_event' > kprobe_events is equivalent to: echo 'p:my_system.my_event' > kprobe_events Link: https://lore.kernel.org/linux-trace-devel/20210813004448.51c7de69ce432d338f4d226b@kernel.org/ Link: https://lkml.kernel.org/r/20210817035027.580493202@goodmis.org Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ef717b373443..0916a9964719 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -233,6 +233,9 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, int len; slash = strchr(event, '/'); + if (!slash) + slash = strchr(event, '.'); + if (slash) { if (slash == event) { trace_probe_log_err(offset, NO_GROUP_NAME); -- cgit v1.2.3 From 845cbf3e11acc263ec7a46a89097d88e7e50a9ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Aug 2021 23:43:00 -0400 Subject: tracing/probes: Use struct_size() instead of defining custom macros Remove SIZEOF_TRACE_KPROBE() and SIZEOF_TRACE_UPROBE() and use struct_size() as that's what it is made for. No need to have custom macros. Especially since struct_size() has some extra memory checks for correctness. Link: https://lkml.kernel.org/r/20210817035027.795000217@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +----- kernel/trace/trace_uprobe.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ed1e3c2087ab..ca726c9d0859 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -80,10 +80,6 @@ static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) -#define SIZEOF_TRACE_KPROBE(n) \ - (offsetof(struct trace_kprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -265,7 +261,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, struct trace_kprobe *tk; int ret = -ENOMEM; - tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); if (!tk) return ERR_PTR(ret); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 93ff96541971..590bb9a02f8d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -83,10 +83,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) for_each_dyn_event(dpos) \ if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, tp.args) + \ - (sizeof(struct probe_arg) * (n))) - static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); @@ -340,7 +336,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) struct trace_uprobe *tu; int ret; - tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); + tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); if (!tu) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 007517a01995fb24f2f4effc9cf34814361a9d10 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:27 -0400 Subject: 
tracing/probe: Change traceprobe_set_print_fmt() to take a type Instead of a boolean "is_return" have traceprobe_set_print_fmt() take a type (currently just PROBE_PRINT_NORMAL and PROBE_PRINT_RETURN). This will simplify adding different types. For example, the development of the event_probe, will need its own type as it prints an event, and not an IP. Link: https://lkml.kernel.org/r/20210819041842.104626301@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++-- kernel/trace/trace_probe.c | 18 ++++++++++++------ kernel/trace/trace_probe.h | 7 ++++++- kernel/trace/trace_uprobe.c | 8 ++++++-- 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ca726c9d0859..c6fe7a6e3f35 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -742,6 +742,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) bool is_return = false; char *symbol = NULL, *tmp = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + enum probe_print_type ptype; int maxactive = 0; long offset = 0; void *addr = NULL; @@ -875,7 +876,8 @@ static int __trace_kprobe_create(int argc, const char *argv[]) goto error; /* This can be -ENOMEM */ } - ret = traceprobe_set_print_fmt(&tk->tp, is_return); + ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) goto error; @@ -1799,6 +1801,7 @@ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { + enum probe_print_type ptype; struct trace_kprobe *tk; int ret; char *event; @@ -1822,7 +1825,9 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, init_trace_event_call(tk); - if (traceprobe_set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) { + ptype = trace_kprobe_is_return(tk) ? + PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { ret = -ENOMEM; goto error; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 0916a9964719..9c9c83a063b2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -851,19 +851,25 @@ int traceprobe_update_arg(struct probe_arg *arg) /* When len=0, we just calculate the needed length */ #define LEN_OR_ZERO (len ? 
len - pos : 0) static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, - bool is_return) + enum probe_print_type ptype) { struct probe_arg *parg; int i, j; int pos = 0; const char *fmt, *arg; - if (!is_return) { + switch (ptype) { + case PROBE_PRINT_NORMAL: fmt = "(%lx)"; arg = "REC->" FIELD_STRING_IP; - } else { + break; + case PROBE_PRINT_RETURN: fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + break; + default: + WARN_ON_ONCE(1); + return 0; } pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); @@ -912,20 +918,20 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, } #undef LEN_OR_ZERO -int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return) +int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype) { struct trace_event_call *call = trace_probe_event_call(tp); int len; char *print_fmt; /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0, is_return); + len = __set_print_fmt(tp, NULL, 0, ptype); print_fmt = kmalloc(len + 1, GFP_KERNEL); if (!print_fmt) return -ENOMEM; /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1, is_return); + __set_print_fmt(tp, print_fmt, len + 1, ptype); call->print_fmt = print_fmt; return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 42aa084902fa..8adf5f3542a6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -363,7 +363,12 @@ extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); -extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); +enum probe_print_type { + PROBE_PRINT_NORMAL, + PROBE_PRINT_RETURN, +}; + +extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); #ifdef CONFIG_PERF_EVENTS extern struct trace_event_call * diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 590bb9a02f8d..09f8ca7f7ba0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -536,6 +536,7 @@ static int __trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; bool is_return = false; @@ -687,7 +688,8 @@ static int __trace_uprobe_create(int argc, const char **argv) goto error; } - ret = traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)); + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tu->tp, ptype); if (ret < 0) goto error; @@ -1578,6 +1580,7 @@ struct trace_event_call * create_local_trace_uprobe(char *name, unsigned long offs, unsigned long ref_ctr_offset, bool is_return) { + enum probe_print_type ptype; struct trace_uprobe *tu; struct path path; int ret; @@ -1612,7 +1615,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->filename = kstrdup(name, GFP_KERNEL); init_trace_event_call(tu); - if (traceprobe_set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) { + ptype = is_ret_probe(tu) ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { ret = -ENOMEM; goto error; } -- cgit v1.2.3 From 8565a45d0858078b63c7d84074a21a42ba9ebf01 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 19 Aug 2021 00:13:28 -0400 Subject: tracing/probes: Have process_fetch_insn() take a void * instead of pt_regs In preparation to allow event probes to use the process_fetch_insn() callback in trace_probe_tmpl.h, change the data passed to it from a pointer to pt_regs, as the event probe will not be using regs, and make it a void pointer instead. Update the process_fetch_insn() callers for kprobe and uprobe events to have the regs defined in the function and just typecast the void pointer parameter. Link: https://lkml.kernel.org/r/20210819041842.291622924@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 3 ++- kernel/trace/trace_probe_tmpl.h | 6 +++--- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6fe7a6e3f35..4b013d24f5a9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1325,9 +1325,10 @@ probe_mem_read(void *dest, void *src, size_t size) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; retry: diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index f003c5d02a3a..b3bdb8ddb862 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. 
*/ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -188,7 +188,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, +store_trace_args(void *data, struct trace_probe *tp, void *rec, int header_size, int maxlen) { struct probe_arg *arg; @@ -203,7 +203,7 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, regs, dl, base); + ret = process_fetch_insn(arg->code, rec, dl, base); if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); } else { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 09f8ca7f7ba0..d219ba50efbd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -213,9 +213,10 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, void *base) { + struct pt_regs *regs = rec; unsigned long val; /* 1st stage: get value from context */ -- cgit v1.2.3 From 8e242060c6a4947e8ae7d29794af6a581db08841 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Aug 2021 19:26:02 +0900 Subject: tracing/probes: Reject events which have the same name of existing one Since kprobe_events and uprobe_events only check whether the other same-type probe event has the same name or not, if the user gives the same name of the existing tracepoint event (or the other type of probe events), it silently fails to create the tracefs entry (but registered.) as below. /sys/kernel/tracing # ls events/task/task_rename enable filter format hist id trigger /sys/kernel/tracing # echo p:task/task_rename vfs_read >> kprobe_events [ 113.048508] Could not create tracefs 'task_rename' directory /sys/kernel/tracing # cat kprobe_events p:task/task_rename vfs_read To fix this issue, check whether the existing events have the same name or not in trace_probe_register_event_call(). If exists, it rejects to register the new event. 
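With this fix the write itself now fails instead of leaving a half-registered event behind. A hedged illustration of the expected result (the error_log formatting below is paraphrased from the usual tracefs error log layout, not captured output):

 /sys/kernel/tracing # echo 'p:task/task_rename vfs_read' >> kprobe_events
 sh: write error: File exists
 /sys/kernel/tracing # cat error_log
 [  113.048508] trace_kprobe: error: Given group/event name is already used by another event
   Command: p:task/task_rename vfs_read
            ^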
Link: https://lkml.kernel.org/r/162936876189.187130.17558311387542061930.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- kernel/trace/trace_probe.c | 25 +++++++++++++++++++++++++ kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 6 +++++- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 4b013d24f5a9..882c27044029 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -647,7 +647,11 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9c9c83a063b2..782c00eb6859 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1047,11 +1047,36 @@ error: return ret; } +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + int trace_probe_register_event_call(struct trace_probe *tp) { struct trace_event_call *call = trace_probe_event_call(tp); int ret; + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + ret = register_trace_event(&call->event); if (!ret) return -ENODEV; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8adf5f3542a6..66701a92d186 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -404,6 +404,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_EVENT_NAME, "Event name is not specified"), \ C(EVENT_TOO_LONG, "Event name is too long"), \ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ C(BAD_STACK_NUM, "Invalid stack number"), \ C(BAD_ARG_NUM, "Invalid argument number"), \ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d219ba50efbd..225ce569bf8f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -515,7 +515,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warn("Failed to register probe event(%d)\n", ret); + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } -- cgit v1.2.3 From 7491e2c442781a1860181adb5ab472a52075f393 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Thu, 19 Aug 2021 11:26:06 -0400 Subject: tracing: Add a probe that attaches to trace events A new dynamic event is introduced: event probe. The event is attached to an existing tracepoint and uses its fields as arguments. 
The user can specify custom format string of the new event, select what tracepoint arguments will be printed and how to print them. An event probe is created by writing configuration string in 'dynamic_events' ftrace file: e[:[SNAME/]ENAME] SYSTEM/EVENT [FETCHARGS] - Set an event probe -:SNAME/ENAME - Delete an event probe Where: SNAME - System name, if omitted 'eprobes' is used. ENAME - Name of the new event in SNAME, if omitted the SYSTEM_EVENT is used. SYSTEM - Name of the system, where the tracepoint is defined, mandatory. EVENT - Name of the tracepoint event in SYSTEM, mandatory. FETCHARGS - Arguments: =$[:TYPE] - Fetch given filed of the tracepoint and print it as given TYPE with given name. Supported types are: (u8/u16/u32/u64/s8/s16/s32/s64), basic type (x8/x16/x32/x64), hexadecimal types "string", "ustring" and bitfield. Example, attach an event probe on openat system call and print name of the file that will be opened: echo "e:esys/eopen syscalls/sys_enter_openat file=\$filename:string" >> dynamic_events A new dynamic event is created in events/esys/eopen/ directory. It can be deleted with: echo "-:esys/eopen" >> dynamic_events Filters, triggers and histograms can be attached to the new event, it can be matched in synthetic events. There is one limitation - an event probe can not be attached to kprobe, uprobe or another event probe. Link: https://lkml.kernel.org/r/20210812145805.2292326-1-tz.stoyanov@gmail.com Link: https://lkml.kernel.org/r/20210819152825.142428383@goodmis.org Acked-by: Masami Hiramatsu Co-developed-by: Steven Rostedt (VMware) Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 5 +- kernel/trace/trace.h | 18 + kernel/trace/trace_eprobe.c | 903 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 14 +- kernel/trace/trace_kprobe.c | 8 - kernel/trace/trace_probe.c | 16 +- kernel/trace/trace_probe.h | 6 +- 8 files changed, 958 insertions(+), 13 deletions(-) create mode 100644 kernel/trace/trace_eprobe.c (limited to 'kernel/trace') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b1c47ccf4f73..6de5d4d63165 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8425c3d70895..489924cde4f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5543,6 +5543,7 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif + "\t e[:[/]] . []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" @@ -5552,7 +5553,7 @@ static const char readme_msg[] = " place (uprobe): :[%return][(ref_ctr_offset)]\n" #endif "\t args: =fetcharg[:type]\n" - "\t fetcharg: %, @
<address>, @<symbol>[+|-<offset>],\n" + "\t fetcharg: (%<register>|$<efield>), @<address>
, @[+|-],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API "\t $stack, $stack, $retval, $comm, $arg,\n" #else @@ -5567,6 +5568,8 @@ static const char readme_msg[] = "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" "\t [unsigned] char/int/long\n" #endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the /.\n" #endif " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a0e693000c6..b7c0f8e160fb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -126,6 +126,11 @@ struct kprobe_trace_entry_head { unsigned long ip; }; +struct eprobe_trace_entry_head { + struct trace_entry ent; + unsigned int type; +}; + struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; @@ -1508,9 +1513,14 @@ static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); +enum { + EVENT_TRIGGER_FL_PROBE = BIT(0), +}; + struct event_trigger_data { unsigned long count; int ref; + int flags; struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; @@ -1918,6 +1928,14 @@ static inline bool is_good_name(const char *name) return true; } +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + /* * This is a generic way to read and write a u64 value from a file in tracefs. * diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c new file mode 100644 index 000000000000..56a96e9750cf --- /dev/null +++ b/kernel/trace/trace_eprobe.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event probes + * + * Part of this code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu + * + * Copyright (C) 2021, VMware Inc, Steven Rostedt + * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> + * + */ +#include +#include +#include + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" + +#define EPROBE_EVENT_SYSTEM "eprobes" + +struct trace_eprobe { + /* tracepoint system */ + const char *event_system; + + /* tracepoint event */ + const char *event_name; + + struct trace_event_call *event; + + struct dyn_event devent; + struct trace_probe tp; +}; + +struct eprobe_data { + struct trace_event_file *file; + struct trace_eprobe *ep; +}; + +static int __trace_eprobe_create(int argc, const char *argv[]); + +static void trace_event_probe_cleanup(struct trace_eprobe *ep) +{ + if (!ep) + return; + trace_probe_cleanup(&ep->tp); + kfree(ep->event_name); + kfree(ep->event_system); + if (ep->event) + trace_event_put_ref(ep->event); + kfree(ep); +} + +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_eprobe, devent); +} + +static int eprobe_dyn_event_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_eprobe_create); +} + +static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int i; + + seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp), + trace_probe_name(&ep->tp)); + seq_printf(m, " %s.%s", ep->event_system, ep->event_name); + + for (i = 0; i < ep->tp.nr_args; 
i++) + seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int unregister_trace_eprobe(struct trace_eprobe *ep) +{ + /* If other probes are on the event, just unregister eprobe */ + if (trace_probe_has_sibling(&ep->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&ep->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (trace_probe_unregister_event_call(&ep->tp)) + return -EBUSY; + +unreg: + dyn_event_remove(&ep->devent); + trace_probe_unlink(&ep->tp); + + return 0; +} + +static int eprobe_dyn_event_release(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int ret = unregister_trace_eprobe(ep); + + if (!ret) + trace_event_probe_cleanup(ep); + return ret; +} + +static bool eprobe_dyn_event_is_busy(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return trace_probe_is_enabled(&ep->tp); +} + +static bool eprobe_dyn_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return strcmp(trace_probe_name(&ep->tp), event) == 0 && + (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && + trace_probe_match_command_args(&ep->tp, argc, argv); +} + +static struct dyn_event_operations eprobe_dyn_event_ops = { + .create = eprobe_dyn_event_create, + .show = eprobe_dyn_event_show, + .is_busy = eprobe_dyn_event_is_busy, + .free = eprobe_dyn_event_release, + .match = eprobe_dyn_event_match, +}; + +static struct trace_eprobe *alloc_event_probe(const char *group, + const char *this_event, + struct trace_event_call *event, + int nargs) +{ + struct trace_eprobe *ep; + const char *event_name; + const char *sys_name; + int ret = -ENOMEM; + + if (!event) + return ERR_PTR(-ENODEV); + + sys_name = event->class->system; + event_name = trace_event_name(event); + + ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + if (!ep) { + trace_event_put_ref(ep->event); + goto error; + } + ep->event = event; + ep->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ep->event_name) + goto error; + ep->event_system = kstrdup(sys_name, GFP_KERNEL); + if (!ep->event_system) + goto error; + + ret = trace_probe_init(&ep->tp, this_event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); + return ep; +error: + trace_event_probe_cleanup(ep); + return ERR_PTR(ret); +} + +static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i) +{ + struct probe_arg *parg = &ep->tp.args[i]; + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ep->event); + list_for_each_entry(field, head, link) { + if (!strcmp(parg->code->data, field->name)) { + kfree(parg->code->data); + parg->code->data = field; + return 0; + } + } + kfree(parg->code->data); + parg->code->data = NULL; + return -ENOENT; +} + +static int eprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct eprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_fields eprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = eprobe_event_define_fields }, + {} +}; + +/* Event 
entry printers */ +static enum print_line_t +print_eprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct eprobe_trace_entry_head *field; + struct trace_event_call *pevent; + struct trace_event *probed_event; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct eprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + probed_event = ftrace_find_event(field->type); + if (probed_event) { + pevent = container_of(probed_event, struct trace_event_call, event); + trace_seq_printf(s, "%s.%s", pevent->class->system, + trace_event_name(pevent)); + } else { + trace_seq_printf(s, "%u", field->type); + } + + trace_seq_putc(s, ')'); + + if (print_probe_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static unsigned long get_event_field(struct fetch_insn *code, void *rec) +{ + struct ftrace_event_field *field = code->data; + unsigned long val; + void *addr; + + addr = rec + field->offset; + + switch (field->size) { + case 1: + if (field->is_signed) + val = *(char *)addr; + else + val = *(unsigned char *)addr; + break; + case 2: + if (field->is_signed) + val = *(short *)addr; + else + val = *(unsigned short *)addr; + break; + case 4: + if (field->is_signed) + val = *(int *)addr; + else + val = *(unsigned int *)addr; + break; + default: + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + return val; +} + +static int get_eprobe_size(struct trace_probe *tp, void *rec) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + unsigned long val; + + val = get_event_field(arg->code, rec); + len = process_fetch_insn_bottom(arg->code + 1, val, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Kprobe specific fetch functions */ + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + unsigned long val; + + val = get_event_field(code, rec); + return process_fetch_insn_bottom(code + 1, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. 
+ */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); + + return ret; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + +/* eprobe handler */ +static inline void +__eprobe_trace_func(struct eprobe_data *edata, void *rec) +{ + struct eprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != edata->file->event_call)) + return; + + if (trace_trigger_soft_disabled(edata->file)) + return; + + fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = edata->file; + + dsize = get_eprobe_size(&edata->ep->tp, rec); + fbuffer.regs = NULL; + + fbuffer.event = + trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, + call->event.type, + sizeof(*entry) + edata->ep->tp.size + dsize, + fbuffer.trace_ctx); + if (!fbuffer.event) + return; + + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + if (edata->ep->event) + entry->type = edata->ep->event->event.type; + else + entry->type = 0; + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* + * The event probe implementation uses event triggers to get access to + * the event it is attached to, but is not an actual trigger. The below + * functions are just stubs to fulfill what is needed to use the trigger + * infrastructure. 
+ */ +static int eprobe_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return 0; +} + +static void eprobe_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + +} + +static int eprobe_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + /* Do not print eprobe event triggers */ + return 0; +} + +static void eprobe_trigger_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct eprobe_data *edata = data->private_data; + + __eprobe_trace_func(edata, rec); +} + +static struct event_trigger_ops eprobe_trigger_ops = { + .func = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, +}; + +static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + return -1; +} + +static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + return -1; +} + +static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + +} + +static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) +{ + return &eprobe_trigger_ops; +} + +static struct event_command event_trigger_cmd = { + .name = "eprobe", + .trigger_type = ETT_EVENT_EPROBE, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = eprobe_trigger_cmd_func, + .reg = eprobe_trigger_reg_func, + .unreg = eprobe_trigger_unreg_func, + .unreg_all = NULL, + .get_trigger_ops = eprobe_trigger_get_ops, + .set_filter = NULL, +}; + +static struct event_trigger_data * +new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) +{ + struct event_trigger_data *trigger; + struct eprobe_data *edata; + + edata = kzalloc(sizeof(*edata), GFP_KERNEL); + trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + if (!trigger || !edata) { + kfree(edata); + kfree(trigger); + return ERR_PTR(-ENOMEM); + } + + trigger->flags = EVENT_TRIGGER_FL_PROBE; + trigger->count = -1; + trigger->ops = &eprobe_trigger_ops; + + /* + * EVENT PROBE triggers are not registered as commands with + * register_event_command(), as they are not controlled by the user + * from the trigger file + */ + trigger->cmd_ops = &event_trigger_cmd; + + INIT_LIST_HEAD(&trigger->list); + RCU_INIT_POINTER(trigger->filter, NULL); + + edata->file = file; + edata->ep = ep; + trigger->private_data = edata; + + return trigger; +} + +static int enable_eprobe(struct trace_eprobe *ep, + struct trace_event_file *eprobe_file) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct trace_array *tr = eprobe_file->tr; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + trigger = new_eprobe_trigger(ep, eprobe_file); + if (IS_ERR(trigger)) + return PTR_ERR(trigger); + + list_add_tail_rcu(&trigger->list, &file->triggers); + + trace_event_trigger_enable_disable(file, 1); + update_cond_flag(file); + + return 0; +} + +static struct trace_event_functions eprobe_funcs = { + .trace = print_eprobe_event +}; + +static int disable_eprobe(struct trace_eprobe *ep, + struct trace_array *tr) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct eprobe_data *edata; + + file = find_event_file(tr, ep->event_system, ep->event_name); + 
if (!file) + return -ENOENT; + + list_for_each_entry(trigger, &file->triggers, list) { + if (!(trigger->flags & EVENT_TRIGGER_FL_PROBE)) + continue; + edata = trigger->private_data; + if (edata->ep == ep) + break; + } + if (list_entry_is_head(trigger, &file->triggers, list)) + return -ENODEV; + + list_del_rcu(&trigger->list); + + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + return 0; +} + +static int enable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + ret = enable_eprobe(ep, file); + if (ret) + break; + enabled = true; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + disable_eprobe(ep, file->tr); + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +static int disable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *pos, *tp; + struct trace_eprobe *ep; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + ep = container_of(pos, struct trace_eprobe, tp); + disable_eprobe(ep, file->tr); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. 
+ */ + trace_probe_remove_file(tp, file); + + return 0; +} + +static int eprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_eprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_eprobe(event, file); +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static inline void init_trace_eprobe_call(struct trace_eprobe *ep) +{ + struct trace_event_call *call = trace_probe_event_call(&ep->tp); + + call->flags = TRACE_EVENT_FL_EPROBE; + call->event.funcs = &eprobe_funcs; + call->class->fields_array = eprobe_fields_array; + call->class->reg = eprobe_register; +} + +static struct trace_event_call * +find_and_get_event(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + /* Skip other probes and ftrace events */ + if (tp_event->flags & + (TRACE_EVENT_FL_IGNORE_ENABLE | + TRACE_EVENT_FL_KPROBE | + TRACE_EVENT_FL_UPROBE | + TRACE_EVENT_FL_EPROBE)) + continue; + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + if (!trace_event_try_get_ref(tp_event)) { + return NULL; + break; + } + return tp_event; + break; + } + return NULL; +} + +static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) +{ + unsigned int flags = TPARG_FL_KERNEL | TPARG_FL_TPOINT; + int ret; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], flags); + if (ret) + return ret; + + if (ep->tp.args[i].code->op == FETCH_OP_TP_ARG) + ret = trace_eprobe_tp_arg_update(ep, i); + + return ret; +} + +static int __trace_eprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * Fetch args: + * =$[:TYPE] + */ + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; + const char *sys_event = NULL, *sys_name = NULL; + struct trace_event_call *event_call; + struct trace_eprobe *ep = NULL; + char buf1[MAX_EVENT_NAME_LEN]; + char buf2[MAX_EVENT_NAME_LEN]; + int ret = 0; + int i; + + if (argc < 2 || argv[0][0] != 'e') + return -ECANCELED; + + trace_probe_log_init("event_probe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) { + event++; + ret = traceprobe_parse_event_name(&event, &group, buf1, + event - argv[0]); + if (ret) + goto parse_error; + } else { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + if (!is_good_name(event) || !is_good_name(group)) + goto parse_error; + + sys_event = argv[1]; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, + sys_event - argv[1]); + if (ret || !sys_name) + goto parse_error; + if (!is_good_name(sys_event) || !is_good_name(sys_name)) + goto parse_error; + + mutex_lock(&event_mutex); + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + mutex_unlock(&event_mutex); + + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto error; /* We know ep is not allocated */ + } + + argc -= 2; argv += 2; + /* parse arguments */ + 
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = trace_eprobe_tp_update_arg(ep, argv, i); + if (ret) + goto error; + } + ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); + if (ret < 0) + goto error; + init_trace_eprobe_call(ep); + mutex_lock(&event_mutex); + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + mutex_unlock(&event_mutex); + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + mutex_unlock(&event_mutex); + return ret; +parse_error: + ret = -EINVAL; +error: + trace_event_probe_cleanup(ep); + return ret; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup eprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_eprobe_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&eprobe_dyn_event_ops); + if (err) + pr_warn("Could not register eprobe_dyn_event_ops\n"); + + return err; +} +core_initcall(trace_events_eprobe_init_early); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6b11e335a62e..3d5c07239a2a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -124,6 +124,18 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) return seq_list_next(t, &event_file->triggers, pos); } +static bool check_user_trigger(struct trace_event_file *file) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; + } + return false; +} + static void *trigger_start(struct seq_file *m, loff_t *pos) { struct trace_event_file *event_file; @@ -134,7 +146,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) if (unlikely(!event_file)) return ERR_PTR(-ENODEV); - if (list_empty(&event_file->triggers)) + if (list_empty(&event_file->triggers) || !check_user_trigger(event_file)) return *pos == 0 ? 
SHOW_AVAILABLE_TRIGGERS : NULL; return seq_list_start(&event_file->triggers, *pos); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 882c27044029..3a64ba4bbad6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,14 +707,6 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; -/* Convert certain expected symbols into '_' when generating event names */ -static inline void sanitize_event_name(char *name) -{ - while (*name++ != '\0') - if (*name == ':' || *name == '.') - *name = '_'; -} - static int __trace_kprobe_create(int argc, const char *argv[]) { /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 782c00eb6859..3ed2a3f37297 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -319,6 +319,13 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif + } else if (flags & TPARG_FL_TPOINT) { + if (code->data) + return -EFAULT; + code->data = kstrdup(arg, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + code->op = FETCH_OP_TP_ARG; } else goto inval_var; @@ -646,13 +653,14 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, !strcmp(parg->type->name, "ustring")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && - code->op != FETCH_OP_DATA) { + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_STRING); goto fail; } if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || - code->op == FETCH_OP_DATA) || parg->count) { + code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || + parg->count) { /* * IMM, DATA and COMM is pointing actual address, those * must be kept, and if parg->count != 0, this is an @@ -867,6 +875,10 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, fmt = "(%lx <- %lx)"; arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; break; + case PROBE_PRINT_EVENT: + fmt = "(%u)"; + arg = "REC->" FIELD_STRING_TYPE; + break; default: WARN_ON_ONCE(1); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 66701a92d186..99e7a5df025e 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define FIELD_STRING_IP "__probe_ip" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" +#define FIELD_STRING_TYPE "__probe_type" #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_TP_ARG, /* Trace Point argument */ FETCH_OP_END, FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ }; @@ -351,7 +353,8 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) -#define TPARG_FL_MASK GENMASK(2, 0) +#define TPARG_FL_TPOINT BIT(3) +#define TPARG_FL_MASK GENMASK(3, 0) extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *argv, unsigned int flags); @@ -366,6 +369,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, enum probe_print_type { PROBE_PRINT_NORMAL, PROBE_PRINT_RETURN, + PROBE_PRINT_EVENT, 
}; extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); -- cgit v1.2.3
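As a worked end-to-end illustration of the interface added above (this mirrors the openat example from the commit message; single quotes keep the shell from expanding $filename, and the enable file follows the usual tracefs layout under the events/esys/eopen/ directory the patch creates):

 cd /sys/kernel/tracing
 echo 'e:esys/eopen syscalls/sys_enter_openat file=$filename:string' >> dynamic_events
 echo 1 > events/esys/eopen/enable
 cat trace
 echo 0 > events/esys/eopen/enable
 echo '-:esys/eopen' >> dynamic_events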