From 771e03842a9e98a1c2013ca1ed8bb2793488f3e5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 30 Nov 2012 10:41:57 -0500 Subject: ring-buffer: Remove unnecessary recursive call in rb_advance_iter() The original ring-buffer code had special checks at the start of rb_advance_iter() and instead of repeating them again at the end of the function if a certain condition existed, I just did a recursive call to rb_advance_iter() because the special condition would cause rb_advance_iter() to return early (after the checks). But as things have changed, the special checks no longer exist and the only thing done for the special_condition is to call rb_inc_iter() and return. Instead of doing a confusing recursive call, just call rb_inc_iter instead. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ce8514feedcd..6ff9cc4658ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3425,7 +3425,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) /* check for end of page padding */ if ((iter->head >= rb_page_size(iter->head_page)) && (iter->head_page != cpu_buffer->commit_page)) - rb_advance_iter(iter); + rb_inc_iter(iter); } static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) -- cgit v1.2.3 From d8a0349c0cea477322c66ea9362f10c62fad5f62 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Tue, 13 Nov 2012 09:53:04 +0800 Subject: tracing: Use this_cpu_ptr per-cpu helper typeof(&buffer) is a pointer to array of 1024 char, or char (*)[1024]. But, typeof(&buffer[0]) is a pointer to char which matches the return type of get_trace_buf(). As is well known, the value of &buffer is equal to &buffer[0]. So returning this_cpu_ptr(&percpu_buffer->buffer[0]) avoids a type cast.
Link: http://lkml.kernel.org/r/50A1A800.3020102@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 2 +- kernel/trace/trace.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c0bd0308741c..71259e2b6b61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) return; local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + buf = this_cpu_ptr(bt->msg_data); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c13e46d7d24..f8b7c626f3fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1517,7 +1517,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer; static char *get_trace_buf(void) { struct trace_buffer_struct *percpu_buffer; - struct trace_buffer_struct *buffer; /* * If we have allocated per cpu buffers, then we do not @@ -1535,9 +1534,7 @@ static char *get_trace_buf(void) if (!percpu_buffer) return NULL; - buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); - - return buffer->buffer; + return this_cpu_ptr(&percpu_buffer->buffer[0]); } static int alloc_percpu_trace_buffer(void) -- cgit v1.2.3 From d24d7dbf3cc49b00a152e55e24f0eeb173c7a971 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 18 Jul 2012 18:16:44 +0800 Subject: tracing: Verify target file before registering a uprobe event Without this patch, we can register a uprobe event for a directory. Enabling such a uprobe event would fail anyway. Example: $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events However, directories cannot be valid targets for uprobe. Hence verify if the target is a regular file during the probe registration.
Link: http://lkml.kernel.org/r/20130103004212.690763002@goodmis.org Cc: Namhyung Kim Signed-off-by: Jovi Zhang Acked-by: Srikar Dronamraju [ cleaned up whitespace and removed redundant IS_DIR() check ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c86e6d4f67fb..87b6db4ccbc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -258,6 +258,10 @@ static int create_trace_uprobe(int argc, char **argv) goto fail_address_parse; inode = igrab(path.dentry->d_inode); + if (!S_ISREG(inode->i_mode)) { + ret = -EINVAL; + goto fail_address_parse; + } argc -= 2; argv += 2; @@ -356,7 +360,7 @@ fail_address_parse: if (inode) iput(inode); - pr_info("Failed to parse address.\n"); + pr_info("Failed to parse address or file.\n"); return ret; } -- cgit v1.2.3 From 6aea49cb5f3001a8275bf9c9f586ec3eb39af194 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 21 Nov 2012 15:13:47 +0800 Subject: tracing/syscalls: Make local functions static Some functions in the syscall tracing is used only locally to the file, but they are labeled global. Convert them to static functions. 
Signed-off-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7609dd6714c2..5329e13e74a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } -enum print_line_t +static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -130,7 +130,7 @@ end: return TRACE_TYPE_HANDLED; } -enum print_line_t +static enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags, struct trace_event *event) { @@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; @@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void 
unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; @@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; @@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return ret; } -void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; @@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -int __init init_ftrace_syscalls(void) +static int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; -- cgit v1.2.3 From a54164114b96b4693b42cdb553260eec41ea4393 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 19 Dec 2012 16:02:34 +0900 Subject: tracing: Add checks if tr->buffer is NULL in tracing_reset{_online_cpus} max_tr->buffer could be NULL in the tracing_reset{_online_cpus}. In this case, a NULL pointer dereference happens, so we should return immediately from these functions. Note, the current code does not call tracing_reset*() with max_tr when its buffer is NULL, but future code will. This patch is needed to prevent the future code from crashing. 
Link: http://lkml.kernel.org/r/20121219070234.31200.93863.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8b7c626f3fd..72b171b90e55 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -922,6 +922,9 @@ void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ @@ -936,6 +939,9 @@ void tracing_reset_online_cpus(struct trace_array *tr) struct ring_buffer *buffer = tr->buffer; int cpu; + if (!buffer) + return; + ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ -- cgit v1.2.3 From 84c6cf0db6a00601eb43cfc08244a398ffb0894c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Dec 2012 21:43:52 -0500 Subject: tracing: Remove unneeded check of max_tr->buffer before tracing_reset There's now a check in tracing_reset_online_cpus() if the buffer is allocated or NULL. No need to do a check before calling it with max_tr. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 72b171b90e55..d62248dfda76 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4040,8 +4040,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, * Reset the buffer so that it doesn't have incomparable timestamps. 
*/ tracing_reset_online_cpus(&global_trace); - if (max_tr.buffer) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&max_tr); mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 8741db532e86da2e54f05be751bfe1922ca63d57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 16 Jan 2013 10:49:37 -0500 Subject: tracing/fgraph: Add max_graph_depth to limit function_graph depth Add the file max_graph_depth to the debug tracing directory that lets the user define the depth of the function graph. A very useful operation is to set the depth to 1. Then it traces only the first function that is called when entering the kernel. This can be used to determine what system operations interrupt a process. For example, to work on NOHZ processes (single tasks running without a timer tick), if any interrupt goes off and preempts that task, this code will show it happening. # cd /sys/kernel/debug/tracing # echo 1 > max_graph_depth # echo function_graph > current_tracer # cat per_cpu/cpu//trace Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 60 ++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4edb4b74eb7e..7008d2e13cf2 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -47,6 +47,8 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_IRQS 0x40 +static unsigned int max_depth; + static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, @@ -250,8 +252,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. 
*/ - if (!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) + if ((!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) || + (max_depth && trace->depth >= max_depth)) return 0; local_irq_save(flags); @@ -1457,6 +1460,59 @@ static struct tracer graph_trace __read_mostly = { #endif }; + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("max_graph_depth", 0644, d_tracer, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_debugfs); + static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); -- cgit v1.2.3 From 06aeaaeabf69da4a3e86df532425640f51b01cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Sep 2012 17:15:17 +0900 Subject: ftrace: Move ARCH_SUPPORTS_FTRACE_SAVE_REGS in Kconfig Move SAVE_REGS support flag into Kconfig and rename it to CONFIG_DYNAMIC_FTRACE_WITH_REGS. This also introduces CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS which indicates the architecture depending part of ftrace has a code that saves full registers. 
On the other hand, CONFIG_DYNAMIC_FTRACE_WITH_REGS indicates the code is enabled. Link: http://lkml.kernel.org/r/20120928081516.3560.72534.stgit@ltc138.sdl.hitachi.co.jp Cc: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace_selftest.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d89335a485f..cdc9d284d24e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE help See Documentation/trace/ftrace-design.txt +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -434,6 +437,11 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41473b4ad7a4..6e34dc162fe1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -337,7 +337,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; -#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. @@ -4143,8 +4143,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * Archs are to support both the regs and ftrace_ops at the same time. 
* If they support ftrace_ops, it is assumed they support regs. * If call backs want to use regs, they must either check for regs - * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. - * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORT_FTARCE_OPS. */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 47623169a815..6c62d58d8e87 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -568,7 +568,7 @@ trace_selftest_function_regs(void) int ret; int supported = 0; -#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS supported = 1; #endif -- cgit v1.2.3 From b000c8065a92b0fe0e1694f41b2c8d8ba7b7b1ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Jan 2013 10:31:20 -0500 Subject: tracing: Remove the extra 4 bytes of padding in events Due to a userspace issue with PowerTop v2beta, which hardcoded the offset of event fields that it was using, it broke when we removed the Big Kernel Lock counter from the event header. (commit e6e1e2593 "tracing: Remove lock_depth from event entry") Because this broke userspace, it was determined that we must keep those 4 bytes around. (commit a3a4a5acd "Regression: partial revert "tracing: Remove lock_depth from event entry"") This unfortunately wastes space in the ring buffer. 4 bytes per event, where a lot of events are just 24 bytes. That's 16% of the buffer wasted. A million events will add 4 megs of white space into the buffer. It was later noticed that PowerTop v2beta could not work on systems where the kernel was 64 bit but the userspace was 32 bits. The reason was because the offsets are different between the two and the hard coded offset of one would not work with the other. 
With PowerTop v2 final, it implemented the same interface that both perf and trace-cmd use. That is, it reads the format file of the event to find the offsets of the fields it needs. This fixes the problem with running powertop on a 32 bit userspace running on a 64 bit kernel. It also no longer requires the 4 byte padding. As PowerTop v2 has been out for a while, and is included in all major distributions, it is time that we can safely remove the 4 bytes of padding. Users of PowerTop v2beta should upgrade to PowerTop v2 final. Cc: Linus Torvalds Acked-by: Arjan van de Ven Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 - kernel/trace/trace_events.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d62248dfda76..a387bd271e71 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1173,7 +1173,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 880073d0b946..57e9b284250c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, padding); return ret; } -- cgit v1.2.3 From 0a71e4c6d749d06f52e75a406fc9046924fcfcc1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 12:06:56 -0500 Subject: tracing: Remove trace.h header from trace_clock.c As trace_clock is used by other things besides tracing, and it does not require anything from trace.h, it is best not to include the header file in trace_clock.c. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 394783531cbb..22b638b28e48 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -21,8 +21,6 @@ #include #include -#include "trace.h" - /* * trace_clock_local(): the simplest and least coherent tracing clock. * -- cgit v1.2.3 From 34600f0e9c33c9cd48ae87448205f51332b7d5a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 13:35:11 -0500 Subject: tracing: Fix race with max_tr and changing tracers There's a race condition between the setting of a new tracer and the update of the max trace buffers (the swap). When a new tracer is added, it sets current_trace to nop_trace before disabling the old tracer. At this moment, if the old tracer uses update_max_tr(), the update may trigger the warning against !current_trace->use_max_tr, as nop_trace doesn't have that set. As update_max_tr() requires that interrupts be disabled, we can add a check to see if current_trace == nop_trace and bail if it does. Then when disabling the current_trace, set it to nop_trace and run synchronize_sched(). This will make sure all calls to update_max_tr() have completed (it was called with interrupts disabled). As a clean up, this commit also removes shrinking and recreating the max_tr buffer if the old and new tracers both have use_max_tr set. The old way used to always shrink the buffer, and then expand it for the next tracer. This is a waste of time.
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a387bd271e71..d2a658349ca1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -709,10 +709,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + + /* If we disabled the tracer, stop now */ + if (current_trace == &nop_trace) return; - } + + if (WARN_ON_ONCE(!current_trace->use_max_tr)) + return; + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -3185,6 +3189,7 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; + bool had_max_tr; int ret = 0; mutex_lock(&trace_types_lock); @@ -3211,7 +3216,19 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - if (current_trace && current_trace->use_max_tr) { + + had_max_tr = current_trace && current_trace->use_max_tr; + current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_sched(); /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. 
ring->clock) and @@ -3222,10 +3239,8 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = &nop_trace; - topts = create_trace_option_files(t); - if (t->use_max_tr) { + if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, RING_BUFFER_ALL_CPUS); -- cgit v1.2.3 From 05cbbf643b8eea1be21082c53cdb856d1dc6d765 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 23:35:11 -0500 Subject: tracing: Fix selftest function recursion accounting The test that checks function recursion does things differently if the arch does not support all ftrace features. But that really doesn't make a difference with how the test runs, and either way the count variable should be 2 at the end. Currently the test wrongly fails for archs that don't support all the ftrace features. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6c62d58d8e87..adb008a0136f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -452,7 +452,6 @@ trace_selftest_function_recursion(void) char *func_name; int len; int ret; - int cnt; /* The previous test PASSED */ pr_cont("PASSED\n"); @@ -510,19 +509,10 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_recsafe_probe); - /* - * If arch supports all ftrace features, and no other task - * was on the list, we should be fine. 
- */ - if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) - cnt = 2; /* Should have recursed */ - else - cnt = 1; - ret = -1; - if (trace_selftest_recursion_cnt != cnt) { - pr_cont("*callback not called expected %d times (%d)* ", - cnt, trace_selftest_recursion_cnt); + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); goto out; } -- cgit v1.2.3 From 6350379452ccaeaa71734adf57dec2ebc9207849 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 16:58:56 -0400 Subject: ftrace: Fix global function tracers that are not recursion safe If one of the function tracers set by the global ops is not recursion safe, it can still be called directly without the added recursion supplied by the ftrace infrastructure. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6e34dc162fe1..789cbec24e81 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -221,10 +221,24 @@ static void update_global_ops(void) * registered callers. */ if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) + ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; - else + /* + * As we are calling the function directly. + * If it does not have recursion protection, + * the function_trace_op needs to be updated + * accordingly. + */ + if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + else + global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; + } else { func = ftrace_global_list_func; + /* The list has its own recursion protection. 
*/ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; + } + /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { -- cgit v1.2.3 From 9640388b63556b4cfecbb5aaf91a5c99d272f429 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:01:20 -0400 Subject: ftrace: Fix function tracing recursion self test The function tracing recursion self test should not crash the machine if the recursion test fails. If it detects that the function tracing is recursing when it should not be, then bail, don't go into an infinite recursive loop. Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adb008a0136f..51c819c12c29 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip, * The ftrace infrastructure should provide the recursion * protection. If not, this will crash the kernel! */ - trace_selftest_recursion_cnt++; + if (trace_selftest_recursion_cnt++ > 10) + return; DYN_FTRACE_TEST_NAME(); } -- cgit v1.2.3 From 0a016409e42f273415f8225ddf2c58eb2df88034 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:03:03 -0400 Subject: ftrace: Optimize the function tracer list loop There are lots of places that perform: op = rcu_dereference_raw(ftrace_control_list); while (op != &ftrace_list_end) { Add a helper macro to do this, and also optimize for a single entity. That is, gcc will optimize a loop for either no iterations or more than one iteration. But usually only a single callback is registered to the function tracer, thus the optimized case should be a single pass. To do this we now do: op = rcu_dereference_raw(list); do { [...]
} while (likely(op = rcu_dereference_raw((op)->next)) && unlikely((op) != &ftrace_list_end)); An op is always registered (ftrace_list_end when no callbacks is registered), thus when a single callback is registered, the link list looks like: top => callback => ftrace_list_end => NULL. The likely(op = op->next) still must be performed due to the race of removing the callback, where the first op assignment could equal ftrace_list_end. In that case, the op->next would be NULL. But this is unlikely (only happens in a race condition when removing the callback). But it is very likely that the next op would be ftrace_list_end, unless more than one callback has been registered. This tells gcc what the most common case is and makes the fast path with the least amount of branches. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 789cbec24e81..1330969d8447 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). 
+ */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /** * ftrace_nr_registered_ops - return number of ops registered * @@ -132,15 +152,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -149,11 +160,9 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, return; trace_recursion_set(TRACE_GLOBAL_BIT); - op = rcu_dereference_raw(ftrace_global_list); /*see above*/ - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); /*see above*/ - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_GLOBAL_BIT); } @@ -4104,14 +4113,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); - op = rcu_dereference_raw(ftrace_control_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_control_list) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4139,12 +4145,10 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). 
*/ preempt_disable_notrace(); - op = rcu_dereference_raw(ftrace_ops_list); - while (op != &ftrace_list_end) { + do_for_each_ftrace_op(op, ftrace_ops_list) { if (ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); - op = rcu_dereference_raw(op->next); - }; + } while_for_each_ftrace_op(op); preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } -- cgit v1.2.3 From c29f122cd7fc178b72b1335b1fce0dff2e5c0f5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:17:59 -0400 Subject: ftrace: Add context level recursion bit checking Currently for recursion checking in the function tracer, ftrace tests a task_struct bit to determine if the function tracer had recursed or not. If it has, then it will return without going further. But this leads to races. If an interrupt came in after the bit was set, the functions being traced would see that bit set and think that the function tracer recursed on itself, and would return. Instead add a bit for each context (normal, softirq, irq and nmi). A check of which context the task is in is made before testing the associated bit. Now if an interrupt preempts the function tracer after the previous context has been set, the interrupt functions can still be traced.
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace.h | 12 +++++++++--- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1330969d8447..639b6ab1f04c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -156,14 +156,27 @@ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_GLOBAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_GLOBAL_IRQ_BIT; + else + bit = TRACE_GLOBAL_SIRQ_BIT; + } else + bit = TRACE_GLOBAL_BIT; + + if (unlikely(trace_recursion_test(bit))) return; - trace_recursion_set(TRACE_GLOBAL_BIT); + trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(TRACE_GLOBAL_BIT); + trace_recursion_clear(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4132,14 +4145,27 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; + unsigned int bit; if (function_trace_stop) return; - if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) - return; + if (in_interrupt()) { + if (in_nmi()) + bit = TRACE_INTERNAL_NMI_BIT; + + else if (in_irq()) + bit = TRACE_INTERNAL_IRQ_BIT; + else + bit = TRACE_INTERNAL_SIRQ_BIT; + } else + bit = TRACE_INTERNAL_BIT; + + if (unlikely(trace_recursion_test(bit))) + return; + + trace_recursion_set(bit); - trace_recursion_set(TRACE_INTERNAL_BIT); /* * Some of the ops may be dynamically allocated, * they must be freed after a synchronize_sched(). 
@@ -4150,7 +4176,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(TRACE_INTERNAL_BIT); + trace_recursion_clear(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c75d7988902c..fe6ccff9cc70 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -299,8 +299,14 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_GLOBAL_BIT (1<<12) -#define TRACE_CONTROL_BIT (1<<13) +#define TRACE_INTERNAL_NMI_BIT (1<<12) +#define TRACE_INTERNAL_IRQ_BIT (1<<13) +#define TRACE_INTERNAL_SIRQ_BIT (1<<14) +#define TRACE_GLOBAL_BIT (1<<15) +#define TRACE_GLOBAL_NMI_BIT (1<<16) +#define TRACE_GLOBAL_IRQ_BIT (1<<17) +#define TRACE_GLOBAL_SIRQ_BIT (1<<18) +#define TRACE_CONTROL_BIT (1<<19) /* * Abuse of the trace_recursion. @@ -309,7 +315,7 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ -#define TRACE_IRQ_BIT (1<<13) +#define TRACE_IRQ_BIT (1<<20) #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -- cgit v1.2.3 From e46cbf75c621725964fe1f6e7013e8bcd86a0e3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:32:25 -0400 Subject: tracing: Make the trace recursion bits into enums Convert the bits into enums which makes the code a little easier to maintain. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fe6ccff9cc70..5a095d6f088d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -298,15 +298,18 @@ struct tracer { #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* for function tracing recursion */ -#define TRACE_INTERNAL_BIT (1<<11) -#define TRACE_INTERNAL_NMI_BIT (1<<12) -#define TRACE_INTERNAL_IRQ_BIT (1<<13) -#define TRACE_INTERNAL_SIRQ_BIT (1<<14) -#define TRACE_GLOBAL_BIT (1<<15) -#define TRACE_GLOBAL_NMI_BIT (1<<16) -#define TRACE_GLOBAL_IRQ_BIT (1<<17) -#define TRACE_GLOBAL_SIRQ_BIT (1<<18) -#define TRACE_CONTROL_BIT (1<<19) +enum { + TRACE_INTERNAL_BIT = 11, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_GLOBAL_BIT, + TRACE_GLOBAL_NMI_BIT, + TRACE_GLOBAL_IRQ_BIT, + TRACE_GLOBAL_SIRQ_BIT, + + TRACE_CONTROL_BIT, /* * Abuse of the trace_recursion. @@ -315,11 +318,12 @@ struct tracer { * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. 
*/ -#define TRACE_IRQ_BIT (1<<20) + TRACE_IRQ_BIT, +}; -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) #define TRACE_PIPE_ALL_CPU -1 -- cgit v1.2.3 From edc15cafcbfa3d73f819cae99885a2e35e4cbce5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:47:21 -0400 Subject: tracing: Avoid unnecessary multiple recursion checks When function tracing occurs, the following steps are made: If arch does not support a ftrace feature: call internal function (uses INTERNAL bits) which calls... If callback is registered to the "global" list, the list function is called and recursion checks the GLOBAL bits. then this function calls... The function callback, which can use the FTRACE bits to check for recursion. Now if the arch does not support a feature, and it calls the global list function which calls the ftrace callback all three of these steps will do a recursion protection. There's no reason to do one if the previous caller already did. The recursion that we are protecting against will go through the same steps again. To prevent the multiple recursion checks, if a recursion bit is set that is higher than the MAX bit of the current check, then we know that the check was made by the previous caller, and we can skip the current check.
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 40 +++++-------------- kernel/trace/trace.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 639b6ab1f04c..ce8c3d68292f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -158,25 +158,15 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, { int bit; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_GLOBAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_GLOBAL_IRQ_BIT; - else - bit = TRACE_GLOBAL_SIRQ_BIT; - } else - bit = TRACE_GLOBAL_BIT; - - if (unlikely(trace_recursion_test(bit))) + bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); + if (bit < 0) return; - trace_recursion_set(bit); do_for_each_ftrace_op(op, ftrace_global_list) { op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); - trace_recursion_clear(bit); + + trace_clear_recursion(bit); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, @@ -4145,26 +4135,14 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; - unsigned int bit; + int bit; if (function_trace_stop) return; - if (in_interrupt()) { - if (in_nmi()) - bit = TRACE_INTERNAL_NMI_BIT; - - else if (in_irq()) - bit = TRACE_INTERNAL_IRQ_BIT; - else - bit = TRACE_INTERNAL_SIRQ_BIT; - } else - bit = TRACE_INTERNAL_BIT; - - if (unlikely(trace_recursion_test(bit))) - return; - - trace_recursion_set(bit); + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; /* * Some of the ops may be dynamically allocated, @@ -4176,7 +4154,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); preempt_enable_notrace(); - trace_recursion_clear(bit); + 
trace_clear_recursion(bit); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5a095d6f088d..c203a51dd412 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -297,18 +297,49 @@ struct tracer { /* Ring buffer has the 10 LSB bits to count */ #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) -/* for function tracing recursion */ +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not suppport a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. 
+ */ enum { - TRACE_INTERNAL_BIT = 11, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, + TRACE_FTRACE_BIT = 11, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + /* GLOBAL_BITs must be greater than FTRACE_BITs */ TRACE_GLOBAL_BIT, TRACE_GLOBAL_NMI_BIT, TRACE_GLOBAL_IRQ_BIT, TRACE_GLOBAL_SIRQ_BIT, + /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + TRACE_CONTROL_BIT, /* @@ -325,6 +356,71 @@ enum { #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT +#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) + return -1; + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit = 1 << bit; + val &= ~bit; + + 
barrier(); + current->trace_recursion = val; +} + #define TRACE_PIPE_ALL_CPU -1 static inline struct ring_buffer_iter * -- cgit v1.2.3 From 897f68a48b1f8fb6cb7493e1ee37e3ed7f879937 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 17:52:35 -0400 Subject: ftrace: Use only the preempt version of function tracing The function tracer had two different versions of function tracing. The disabling of irqs version and the preempt disable version. As function tracing is very intrusive and can cause nasty recursion issues, it has its own recursion protection. But the old method to do this was a flat layer. If it detected that a recursion was happening then it would just return without recording. This made the preempt version (much faster than the irq disabling one) not very useful, because if an interrupt were to occur after the recursion flag was set, the interrupt would not be traced at all, because every function that was traced would think it recursed on itself (due to the context it preempted setting the recursive flag). Now that we have a recursion flag for every context level, we no longer need to worry about that. We can disable preemption, set the current context recursion check bit, and go on. If an interrupt were to come along, it would check its own context bit and happily continue to trace. As the preempt version is faster than the irq disable version, there's no more reason to keep the irq disable version around. And the irq disable version still had an issue with missing out on tracing NMI code. Remove the irq disable function tracer version and have the preempt disable version be the default (and only version).
Before this patch we had from running: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 12.028 Time: 11.945 Time: 11.925 Time: 11.964 Time: 12.002 Time: 11.910 Time: 11.944 Time: 11.929 Time: 11.941 Time: 11.924 (average: 11.9512) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) a 13.8% savings! Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 61 ++++++++++-------------------------------- 1 file changed, 14 insertions(+), 47 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 8e3ad8082ab7..1c327ef13a9a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) -{ - struct trace_array *tr = func_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - preempt_disable_notrace(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - preempt_enable_notrace(); -} - /* Our option */ enum { TRACE_FUNC_OPT_STACK = 0x1, @@ -85,34 +57,34 @@ static struct tracer_flags func_flags; static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) - { struct trace_array *tr = func_trace; struct 
trace_array_cpu *data; unsigned long flags; - long disabled; + unsigned int bit; int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + pc = preempt_count(); + preempt_disable_notrace(); - if (likely(disabled == 1)) { - pc = preempt_count(); + bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + if (bit < 0) + goto out; + + cpu = smp_processor_id(); + data = tr->data[cpu]; + if (!atomic_read(&data->disabled)) { + local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } + trace_clear_recursion(bit); - atomic_dec(&data->disabled); - local_irq_restore(flags); + out: + preempt_enable_notrace(); } static void @@ -185,11 +157,6 @@ static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - if (func_flags.val & TRACE_FUNC_OPT_STACK) register_ftrace_function(&trace_stack_ops); else -- cgit v1.2.3 From 567cd4da54ff45513d2ca1f0e3cb9ba45b66d6cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Nov 2012 18:33:05 -0400 Subject: ring-buffer: User context bit recursion checking Using context bit recursion checking, we can help increase the performance of the ring buffer. 
Before this patch: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 10.285 Time: 10.407 Time: 10.243 Time: 10.372 Time: 10.380 Time: 10.198 Time: 10.272 Time: 10.354 Time: 10.248 Time: 10.253 (average: 10.3012) Now we have: # echo function > /debug/tracing/current_tracer # for i in `seq 10`; do ./hackbench 50; done Time: 9.712 Time: 9.824 Time: 9.861 Time: 9.827 Time: 9.962 Time: 9.905 Time: 9.886 Time: 10.088 Time: 9.861 Time: 9.834 (average: 9.876) a 4% savings! Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 85 ++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 13 +++---- 2 files changed, 67 insertions(+), 31 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6ff9cc4658ed..481e26269281 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2432,41 +2432,76 @@ rb_reserve_next_event(struct ring_buffer *buffer, #ifdef CONFIG_TRACING -#define TRACE_RECURSIVE_DEPTH 16 +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. 
To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + */ +static DEFINE_PER_CPU(unsigned int, current_context); -/* Keep this code out of the fast path cache */ -static noinline void trace_recursive_fail(void) +static __always_inline int trace_recursive_lock(void) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); - - printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - trace_recursion_buffer(), - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + unsigned int val = this_cpu_read(current_context); + int bit; - WARN_ON_ONCE(1); -} - -static inline int trace_recursive_lock(void) -{ - trace_recursion_inc(); + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; - if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) - return 0; + if (unlikely(val & (1 << bit))) + return 1; - trace_recursive_fail(); + val |= (1 << bit); + this_cpu_write(current_context, val); - return -1; + return 0; } -static inline void trace_recursive_unlock(void) +static __always_inline void trace_recursive_unlock(void) { - WARN_ON_ONCE(!trace_recursion_buffer()); + unsigned int val = this_cpu_read(current_context); - trace_recursion_dec(); + val--; + val &= this_cpu_read(current_context); + this_cpu_write(current_context, val); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c203a51dd412..04a2c7ab1735 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -291,11 +291,6 @@ struct tracer { /* Only current can touch trace_recursion */ -#define trace_recursion_inc() do { 
(current)->trace_recursion++; } while (0) -#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) - -/* Ring buffer has the 10 LSB bits to count */ -#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) /* * For function tracing recursion: @@ -323,7 +318,13 @@ struct tracer { * caller, and we can skip the current check. */ enum { - TRACE_FTRACE_BIT = 11, + TRACE_BUFFER_BIT, + TRACE_BUFFER_NMI_BIT, + TRACE_BUFFER_IRQ_BIT, + TRACE_BUFFER_SIRQ_BIT, + + /* Start of function recursion bits */ + TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, -- cgit v1.2.3 From 0b07436d95b5404134da4d661fd183eac863513e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jan 2013 16:58:30 -0500 Subject: ring-buffer: Remove trace.h from ring_buffer.c ring_buffer.c used to require declarations from trace.h, but these have moved to the generic header files. There's nothing in trace.h that ring_buffer.c requires. There are some headers that trace.h included that ring_buffer.c needs, but it's best that it includes them directly, and not include trace.h. Also, some things may use ring_buffer.c without having tracing configured. This removes the dependency that may come in the future.
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 481e26269281..13950d9027cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3,8 +3,10 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include +#include #include #include #include @@ -21,7 +23,6 @@ #include #include -#include "trace.h" static void update_pages_handler(struct work_struct *work); -- cgit v1.2.3 From d41032a83b4683481cadff84bbf8e0eafeaba830 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 24 Jan 2013 07:52:34 -0500 Subject: tracing: Fix unsigned int compare of zero in recursion check Dan's smatch found a compare bug with the result of the trace_test_and_set_recursion() and comparing to less than zero. If the function fails, it returns -1, but was saved in an unsigned int, which will never be less than zero and will ignore the result of the test if a recursion did happen. Luckily this is the last of the recursion tests, as the infrastructure of ftrace would catch recursions before it got here, except for some few exceptions. 
Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1c327ef13a9a..601152523326 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -61,7 +61,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = func_trace; struct trace_array_cpu *data; unsigned long flags; - unsigned int bit; + int bit; int cpu; int pc; -- cgit v1.2.3 From b736f48bda54ec75b7dc9306884c3843f1a78a0a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 18 Nov 2012 21:27:45 -0800 Subject: tracing: Mark tracing_dentry_percpu() static Nothing outside of kernel/trace/trace.c references tracing_dentry_percpu(). Link: http://lkml.kernel.org/r/1353302917-13995-7-git-send-email-josh@joshtriplett.org Signed-off-by: Josh Triplett Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d2a658349ca1..ca9b7dfed8ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4506,7 +4506,7 @@ struct dentry *tracing_init_dentry(void) static struct dentry *d_percpu; -struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(void) { static int once; struct dentry *d_tracer; -- cgit v1.2.3 From 821465295b36136998ef294fe176fba4e09c1cd9 Mon Sep 17 00:00:00 2001 From: Shan Wei Date: Mon, 19 Nov 2012 13:21:01 +0800 Subject: tracing: Use __this_cpu_inc/dec operation instead of __get_cpu_var __this_cpu_inc_return() or __this_cpu_dec generates a single instruction, which is faster than __get_cpu_var operation. 
Link: http://lkml.kernel.org/r/50A9C1BD.1060308@gmail.com Reviewed-by: Christoph Lameter Signed-off-by: Shan Wei Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca9b7dfed8ef..07888e15c694 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1344,7 +1344,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = ++__get_cpu_var(ftrace_stack_reserve); + use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* * We don't need any atomic variables, just a barrier. * If an interrupt comes in, we don't care, because it would @@ -1398,7 +1398,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, out: /* Again, don't let gcc optimize things here */ barrier(); - __get_cpu_var(ftrace_stack_reserve)--; + __this_cpu_dec(ftrace_stack_reserve); preempt_enable_notrace(); } -- cgit v1.2.3 From 38dbe0b137bfe6ea92be495017885c0785179a02 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Fri, 25 Jan 2013 18:03:07 +0800 Subject: tracing: Remove second iterator initializer The trace iterator is already initialized by trace_init_global_iter(), so there is no need to initialize it again. 
Link: http://lkml.kernel.org/r/CACV3sb+G1YnO6168JhY3dEadmJi58pA5-2cSZT8E0WVHJNFt9Q@mail.gmail.com Signed-off-by: Jovi Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 07888e15c694..d399592701aa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5030,6 +5030,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { @@ -5041,10 +5042,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - /* Simulate the iterator */ - iter.tr = &global_trace; - iter.trace = current_trace; - switch (oops_dump_mode) { case DUMP_ALL: iter.cpu_file = TRACE_PIPE_ALL_CPU; -- cgit v1.2.3 From 03274a3ffb449632970fdd35da72ea41cf8474da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:30:31 -0500 Subject: tracing/fgraph: Adjust fgraph depth before calling trace return callback While debugging the virtual cputime with the function graph tracer with a max_depth of 1 (most common use of the max_depth so far), I found that I was missing kernel execution because of a race condition. The code for the return side of the function has a slight race: ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; The ftrace_pop_return_trace() initializes the trace structure for the callback. The ftrace_graph_return() uses the trace structure for its own use as that structure is on the stack and is local to this function. Then the curr_ret_stack is decremented which is what the trace.depth is set to. 
If an interrupt comes in after the ftrace_graph_return() but before the curr_ret_stack is decremented, then the called function will get a depth of 2. If max_depth is set to 1 this function will be ignored. The problem is that the trace has already been called, and the timestamp for that trace will not reflect the time the function was about to re-enter userspace. Calls to the interrupt will not be traced because the max_depth has prevented this. To solve this issue, the ftrace_graph_return() can safely be moved after the current->curr_ret_stack has been updated. This way the timestamp for the return callback will reflect the actual time. If an interrupt comes in after the curr_ret_stack update but before ftrace_graph_return(), it will be traced. It may look a little confusing to see it within the other function, but at least it will not be lost. Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7008d2e13cf2..39ada66389cc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -191,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) ftrace_pop_return_trace(&trace, &ret, frame_pointer); trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); barrier(); current->curr_ret_stack--; + /* + * The trace should run after decrementing the ret counter + * in case an interrupt were to come in. We don't want to + * lose the interrupt if max_depth is set.
+ */ + ftrace_graph_return(&trace); + if (unlikely(!ret)) { ftrace_graph_stop(); WARN_ON(1); -- cgit v1.2.3 From ad964704ba9326d027fc10fd0099b7c880e50172 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 29 Jan 2013 17:45:49 -0500 Subject: ring-buffer: Add stats field for amount read from trace ring buffer Add a stat about the number of events read from the ring buffer: # cat /debug/tracing/per_cpu/cpu0/stats entries: 39869 overrun: 870512 commit overrun: 0 bytes: 1449912 oldest event ts: 6561.368690 now ts: 6565.246426 dropped events: 0 read events: 112 <-- Added Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 3 +++ 2 files changed, 21 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 13950d9027cb..7244acde77b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3102,6 +3102,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d399592701aa..90a1c71fdbfc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4430,6 +4430,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); 
+ cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); -- cgit v1.2.3 From 5e67b51e3fb22ad43faf9589e9019ad9c6a00413 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 27 Dec 2012 11:49:45 +0900 Subject: tracing: Use sched_clock_cpu for trace_clock_global For systems with an unstable sched_clock, all cpu_clock() does is enable/disable local irq during the call to sched_clock_cpu(). And for stable systems they are the same. trace_clock_global() already disables interrupts, so it can call sched_clock_cpu() directly. Link: http://lkml.kernel.org/r/1356576585-28782-2-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 22b638b28e48..24bf48eabfcc 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void) local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = cpu_clock(this_cpu); + now = sched_clock_cpu(this_cpu); /* * If in an NMI context then dont risk lockups and return the * cpu_clock() time: -- cgit v1.2.3 From 2fd196ec1eab2623096e7fc7e6f3976160392bce Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:52:52 +0900 Subject: tracing: Replace static old_tracer check of tracer name Currently the trace buffer read functions use a static variable "old_tracer" for detecting if the current tracer changes. This was suitable for a single trace file ("trace"), but to add a snapshot feature that will use the same function for its file, a check against a static variable is not sufficient.
To use the output functions for two different files, instead of storing the current tracer in a static variable, as the trace iterator descriptor contains a pointer to the original current tracer's name, that pointer can now be used to check if the current tracer has changed between different reads of the trace file. Link: http://lkml.kernel.org/r/20121226025252.3252.9276.stgit@liselsia Signed-off-by: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90a1c71fdbfc..2c724662a3e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1948,18 +1948,20 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; - static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; - /* copy the tracer to avoid using a global lock all around */ + /* + * copy the tracer to avoid using a global lock all around. + * iter->trace is a copy of current_trace, the pointer to the + * name may be used instead of a strcmp(), as iter->trace->name + * will point to the same string as current_trace->name. 
+ */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); atomic_inc(&trace_record_cmdline_disabled); @@ -3494,7 +3496,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -3506,10 +3507,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); /* @@ -3665,7 +3664,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; @@ -3675,10 +3673,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(old_tracer != current_trace && current_trace)) { - old_tracer = current_trace; + if (unlikely(current_trace && iter->trace->name != current_trace->name)) *iter->trace = *current_trace; - } mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); -- cgit v1.2.3 From debdd57f5145f3c6a4b3f8d0126abd1a2def7fc6 Mon Sep 17 00:00:00 2001 From: Hiraku Toyooka Date: Wed, 26 Dec 2012 11:53:00 +0900 Subject: tracing: Make a snapshot feature available from userspace Ftrace has a snapshot feature available from kernel space and latency tracers (e.g. irqsoff) are using it. 
This patch enables user applications to take a snapshot via debugfs. Add "snapshot" debugfs file in "tracing" directory. snapshot: This is used to take a snapshot and to read the output of the snapshot. # echo 1 > snapshot This will allocate the spare buffer for snapshot (if it is not allocated), and take a snapshot. # cat snapshot This will show contents of the snapshot. # echo 0 > snapshot This will free the snapshot if it is allocated. Any other positive values will clear the snapshot contents if the snapshot is allocated, or return EINVAL if it is not allocated. Link: http://lkml.kernel.org/r/20121226025300.3252.86850.stgit@liselsia Cc: Jiri Olsa Cc: David Sharp Signed-off-by: Hiraku Toyooka [ Fixed irqsoff selftest and also a conflict with a change that fixes the update_max_tr. ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 10 ++++ kernel/trace/trace.c | 166 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 1 + 3 files changed, 151 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cdc9d284d24e..36567564e221 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -253,6 +253,16 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/debug/tracing/snapshot + cat snapshot + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c724662a3e8..70dce64b9ecf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -710,12 +710,11 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); - /* If we disabled the tracer, stop now */ - if (current_trace == &nop_trace) - return; - - if (WARN_ON_ONCE(!current_trace->use_max_tr)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); @@ -743,10 +742,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->use_max_tr) { - WARN_ON_ONCE(1); + if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) return; - } arch_spin_lock(&ftrace_max_lock); @@ -866,10 +863,13 @@ int register_tracer(struct tracer *type) current_trace = type; - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + type->allocated_snapshot = true; + } /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -885,10 +885,14 @@ int register_tracer(struct tracer *type) /* Only reset on passing, to avoid touching corrupted buffers */ 
tracing_reset_online_cpus(tr); - /* Shrink the max buffer again */ - if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); + if (type->use_max_tr) { + type->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + } printk(KERN_CONT "PASSED\n"); } @@ -1964,7 +1968,11 @@ static void *s_start(struct seq_file *m, loff_t *pos) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); - atomic_inc(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); + + if (!iter->snapshot) + atomic_inc(&trace_record_cmdline_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -2003,7 +2011,11 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; - atomic_dec(&trace_record_cmdline_disabled); + if (iter->snapshot && iter->trace->use_max_tr) + return; + + if (!iter->snapshot) + atomic_dec(&trace_record_cmdline_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2438,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { long cpu_file = (long) inode->i_private; struct trace_iterator *iter; @@ -2471,10 +2483,11 @@ __tracing_open(struct inode *inode, struct file *file) if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace && current_trace->print_max) + if ((current_trace && current_trace->print_max) || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; + iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); iter->cpu_file = cpu_file; @@ -2491,8 +2504,9 @@ __tracing_open(struct inode *inode, struct file *file) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= 
TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping */ - tracing_stop(); + /* stop the trace while dumping if we are not opening "snapshot" */ + if (!iter->snapshot) + tracing_stop(); if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) { @@ -2555,8 +2569,9 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - /* reenable tracing if it was previously enabled */ - tracing_start(); + if (!iter->snapshot) + /* reenable tracing if it was previously enabled */ + tracing_start(); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2584,7 +2599,7 @@ static int tracing_open(struct inode *inode, struct file *file) } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); else if (trace_flags & TRACE_ITER_LATENCY_FMT) @@ -3219,7 +3234,7 @@ static int tracing_set_tracer(const char *buf) if (current_trace && current_trace->reset) current_trace->reset(tr); - had_max_tr = current_trace && current_trace->use_max_tr; + had_max_tr = current_trace && current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3238,6 +3253,8 @@ static int tracing_set_tracer(const char *buf) */ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; } destroy_trace_option_files(topts); @@ -3248,6 +3265,7 @@ static int tracing_set_tracer(const char *buf) RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; + t->allocated_snapshot = true; } if (t->init) { @@ -4066,6 +4084,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return single_open(file, tracing_clock_show, NULL); } +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + 
int ret = 0; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } + return ret; +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (current_trace && current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + switch (val) { + case 0: + if (current_trace->allocated_snapshot) { + /* free spare buffer */ + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); + tracing_reset_online_cpus(&max_tr); + current_trace->allocated_snapshot = false; + } + break; + case 1: + if (!current_trace->allocated_snapshot) { + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&max_tr, + &global_trace, RING_BUFFER_ALL_CPUS); + if (ret < 0) + break; + current_trace->allocated_snapshot = true; + } + + local_irq_disable(); + /* Now, we're going to swap */ + update_max_tr(&global_trace, current, smp_processor_id()); + local_irq_enable(); + break; + default: + if (current_trace->allocated_snapshot) + tracing_reset_online_cpus(&max_tr); + else + ret = -EINVAL; + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -4122,6 +4221,16 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_seek, + .release = tracing_release, +}; 
+#endif /* CONFIG_TRACER_SNAPSHOT */ + struct ftrace_buffer_info { struct trace_array *tr; void *spare; @@ -4921,6 +5030,11 @@ static __init int tracer_init_debugfs(void) &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); +#endif + create_trace_options_dir(); for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 04a2c7ab1735..57d7e5397d56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -287,6 +287,7 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool use_max_tr; + bool allocated_snapshot; }; -- cgit v1.2.3 From d840f718d28715a9833c1a8f46c2493ff3fd219b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 1 Feb 2013 18:38:47 -0500 Subject: tracing: Init current_trace to nop_trace and remove NULL checks On early boot up, when the ftrace ring buffer is initialized, the static variable current_trace is initialized to &nop_trace. Before this initialization, current_trace is NULL and will never become NULL again. It is always reassigned to a ftrace tracer. Several places check if current_trace is NULL before it uses it, and this check is frivolous, because at the point in time when the checks are made the only way current_trace could be NULL is if ftrace failed its allocations at boot up, and the paths to these locations would probably not be possible. By initializing current_trace to &nop_trace where it is declared, current_trace will never be NULL, and we can remove all these checks of current_trace being NULL which never needed to be checked in the first place. 
Cc: Dan Carpenter Cc: Hiraku Toyooka Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 70dce64b9ecf..5d520b7bb4c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -249,7 +249,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; static struct tracer *trace_types __read_mostly; /* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly; +static struct tracer *current_trace __read_mostly = &nop_trace; /* * trace_types_lock is used to protect the trace_types list. @@ -2100,8 +2100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) unsigned long total; const char *name = "preemption"; - if (type) - name = type->name; + name = type->name; get_total_entries(tr, &total, &entries); @@ -2477,13 +2476,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if ((current_trace && current_trace->print_max) || snapshot) + if (current_trace->print_max || snapshot) iter->tr = &max_tr; else iter->tr = &global_trace; @@ -3037,10 +3035,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int r; mutex_lock(&trace_types_lock); - if (current_trace) - r = sprintf(buf, "%s\n", current_trace->name); - else - r = sprintf(buf, "\n"); + r = sprintf(buf, "%s\n", current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3231,10 +3226,10 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); - if (current_trace && current_trace->reset) + if (current_trace->reset) current_trace->reset(tr); - had_max_tr = 
current_trace && current_trace->allocated_snapshot; + had_max_tr = current_trace->allocated_snapshot; current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3373,8 +3368,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - if (current_trace) - *iter->trace = *current_trace; + *iter->trace = *current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3525,7 +3519,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -3691,7 +3685,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) + if (unlikely(iter->trace->name != current_trace->name)) *iter->trace = *current_trace; mutex_unlock(&trace_types_lock); @@ -4115,7 +4109,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace && current_trace->use_max_tr) { + if (current_trace->use_max_tr) { ret = -EBUSY; goto out; } @@ -5299,7 +5293,7 @@ __init static int tracer_alloc_buffers(void) init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); - current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3 From fe20d71f25400cccc8bffef865f79250be7dbc81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 21 Nov 2012 17:32:30 +0100 Subject: uprobes: Kill uprobe_consumer->filter() uprobe_consumer->filter() is pointless in its current form, kill it. We will add it back, but with the different signature/semantics. 
Perhaps we will even re-introduce the callsite in handler_chain(), but not to just skip uc->handler(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 87b6db4ccbc5..e668024773d4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -550,7 +550,6 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; - utc->cons.filter = NULL; ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { kfree(utc); -- cgit v1.2.3 From 74e59dfc6b19e3472a7c16ad57bc831e6e647895 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Dec 2012 15:54:08 +0100 Subject: uprobes: Change handle_swbp() to expose bp_vaddr to handler_chain() Change handle_swbp() to set regs->ip = bp_vaddr in advance, this is what consumer->handler() needs but uprobe_get_swbp_addr() is not exported. This also simplifies the code and makes it more consistent across the supported architectures. handle_swbp() becomes the only caller of uprobe_get_swbp_addr(). 
Signed-off-by: Oleg Nesterov Acked-by: Ananth N Mavinakayanahalli --- kernel/trace/trace_uprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index e668024773d4..17d9b2bcc28d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,7 +492,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -667,7 +667,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); + entry->ip = instruction_pointer(task_pt_regs(current)); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v1.2.3 From 84d7ed799fd6c1366547d88ddb8188c65de3b94f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:20:45 +0100 Subject: uprobes/tracing: Fix dentry/mount leak in create_trace_uprobe() create_trace_uprobe() does kern_path() to find ->d_inode, but forgets to do path_put(). We can do this right after igrab(). 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 17d9b2bcc28d..06c22bad776a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -253,16 +253,18 @@ static int create_trace_uprobe(int argc, char **argv) if (ret) goto fail_address_parse; - ret = kstrtoul(arg, 0, &offset); - if (ret) - goto fail_address_parse; - inode = igrab(path.dentry->d_inode); + path_put(&path); + if (!S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } + ret = kstrtoul(arg, 0, &offset); + if (ret) + goto fail_address_parse; + argc -= 2; argv += 2; -- cgit v1.2.3 From 4161824f18ff4f56f46595a4016c7315dd0d24f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 27 Jan 2013 18:36:24 +0100 Subject: uprobes/tracing: Fully initialize uprobe_trace_consumer before uprobe_register() probe_event_enable() does uprobe_register() and only after that sets utc->tu and tu->consumer/flags. This can race with uprobe_dispatcher() which can miss these assignments or see them out of order. Nothing really bad can happen, but this doesn't look clean/safe. And this does not allow to use uprobe_consumer->filter() we are going to add, it is called by uprobe_register() and it needs utc->tu. Change this code to initialize everything before uprobe_register(), and reset tu->consumer/flags if it fails. We can't race with event_disable(), the caller holds event_mutex, and if we could the code would be wrong anyway. In fact I think uprobe_trace_consumer should die, it buys nothing but complicates the code. We can simply add uprobe_consumer into trace_uprobe. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 06c22bad776a..15b8eceeddc5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,17 +552,18 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) return -EINTR; utc->cons.handler = uprobe_dispatcher; + utc->tu = tu; + tu->consumer = utc; + tu->flags |= flag; + ret = uprobe_register(tu->inode, tu->offset, &utc->cons); if (ret) { + tu->consumer = NULL; + tu->flags &= ~flag; kfree(utc); - return ret; } - tu->flags |= flag; - utc->tu = tu; - tu->consumer = utc; - - return 0; + return ret; } static void probe_event_disable(struct trace_uprobe *tu, int flag) -- cgit v1.2.3 From 7e4e28c53963e6cfa94d8109bb8f5233c5659048 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 28 Jan 2013 17:08:47 +0100 Subject: uprobes/tracing: Ensure inode != NULL in create_trace_uprobe() probe_event_enable/disable() check tu->inode != NULL at the start. This is ugly, if igrab() can fail create_trace_uprobe() should not succeed and "postpone" the failure. And S_ISREG(inode->i_mode) check added by d24d7dbf is not safe. Note: alloc_uprobe() should probably check igrab() != NULL as well. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 15b8eceeddc5..f7838cfd61b9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -256,7 +256,7 @@ static int create_trace_uprobe(int argc, char **argv) inode = igrab(path.dentry->d_inode); path_put(&path); - if (!S_ISREG(inode->i_mode)) { + if (!inode || !S_ISREG(inode->i_mode)) { ret = -EINVAL; goto fail_address_parse; } @@ -544,7 +544,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) struct uprobe_trace_consumer *utc; int ret = 0; - if (!tu->inode || tu->consumer) + if (tu->consumer) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +568,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->inode || !tu->consumer) + if (!tu->consumer) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From b64b007797c1e6d6b745c93c296ba1d5f4d72d86 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:15:30 +0100 Subject: uprobes/tracing: Introduce is_trace_uprobe_enabled() probe_event_enable/disable() check tu->consumer != NULL to avoid the wrong uprobe_register/unregister(). We are going to kill this pointer and "struct uprobe_trace_consumer", so we add the new helper, is_trace_uprobe_enabled(), which can rely on TP_FLAG_TRACE/TP_FLAG_PROFILE instead. Note: the current logic doesn't look optimal, it is not clear why TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive, we will probably change this later. Also kill the unused TP_FLAG_UPROBE. 
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_probe.h | 1 - kernel/trace/trace_uprobe.c | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 933708677814..5c7e09d10d74 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -66,7 +66,6 @@ #define TP_FLAG_TRACE 1 #define TP_FLAG_PROFILE 2 #define TP_FLAG_REGISTERED 4 -#define TP_FLAG_UPROBE 8 /* data_rloc: data relative location, compatible with u32 */ diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f7838cfd61b9..d6c6e2a345a7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -539,12 +539,17 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } +static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) +{ + return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + static int probe_event_enable(struct trace_uprobe *tu, int flag) { struct uprobe_trace_consumer *utc; int ret = 0; - if (tu->consumer) + if (is_trace_uprobe_enabled(tu)) return -EINTR; utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); @@ -568,7 +573,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) static void probe_event_disable(struct trace_uprobe *tu, int flag) { - if (!tu->consumer) + if (!is_trace_uprobe_enabled(tu)) return; uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); -- cgit v1.2.3 From a932b7381f81235530c3d0acbd3ba2c7537d78e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:47:23 +0100 Subject: uprobes/tracing: Kill uprobe_trace_consumer, embed uprobe_consumer into trace_uprobe trace_uprobe->consumer and "struct uprobe_trace_consumer" add the unnecessary indirection and complicate the code for no reason. This patch simply embeds uprobe_consumer into "struct trace_uprobe", all other changes only fix the compilation errors. 
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d6c6e2a345a7..9c8babbfd11b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -31,17 +31,11 @@ /* * uprobe event core functions */ -struct trace_uprobe; -struct uprobe_trace_consumer { - struct uprobe_consumer cons; - struct trace_uprobe *tu; -}; - struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; - struct uprobe_trace_consumer *consumer; + struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; @@ -92,6 +86,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) goto error; INIT_LIST_HEAD(&tu->list); + tu->consumer.handler = uprobe_dispatcher; return tu; error: @@ -546,27 +541,15 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) static int probe_event_enable(struct trace_uprobe *tu, int flag) { - struct uprobe_trace_consumer *utc; int ret = 0; if (is_trace_uprobe_enabled(tu)) return -EINTR; - utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); - if (!utc) - return -EINTR; - - utc->cons.handler = uprobe_dispatcher; - utc->tu = tu; - tu->consumer = utc; tu->flags |= flag; - - ret = uprobe_register(tu->inode, tu->offset, &utc->cons); - if (ret) { - tu->consumer = NULL; + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + if (ret) tu->flags &= ~flag; - kfree(utc); - } return ret; } @@ -576,10 +559,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; - kfree(tu->consumer); - tu->consumer = NULL; } static int 
uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -717,13 +698,9 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { - struct uprobe_trace_consumer *utc; struct trace_uprobe *tu; - utc = container_of(con, struct uprobe_trace_consumer, cons); - tu = utc->tu; - if (!tu || tu->consumer != utc) - return 0; + tu = container_of(con, struct trace_uprobe, consumer); if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From 1b47aefd9b6bd439a4be43c47acd22987ac22db8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 Jan 2013 19:55:27 +0100 Subject: uprobes/perf: Always increment trace_uprobe->nhit Move tu->nhit++ from uprobe_trace_func() to uprobe_dispatcher(). ->nhit counts how many times we hit the breakpoint inserted by this uprobe, we do not want to lose this info if uprobe was enabled by sys_perf_event_open(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/trace/trace_uprobe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9c8babbfd11b..c4e29e19fdd7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -476,8 +476,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) unsigned long irq_flags; struct ftrace_event_call *call = &tu->call; - tu->nhit++; - local_save_flags(irq_flags); pc = preempt_count(); @@ -701,6 +699,7 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) struct trace_uprobe *tu; tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; if (tu->flags & TP_FLAG_TRACE) uprobe_trace_func(tu, regs); -- cgit v1.2.3 From 736288ba5016e255869c26296014eeff649971c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 Feb 2013 20:58:35 +0100 Subject: uprobes/perf: Teach
trace_uprobe/perf code to track the active perf_event's Introduce "struct trace_uprobe_filter" which records the "active" perf_event's attached to ftrace_event_call. For the start we simply use list_head, we can optimize this later if needed. For example, we do not really need to record an event with ->parent != NULL, we can rely on parent->child_list. And we can certainly do some optimizations for the case when 2 events have the same ->tp_target or tp_target->mm. Change trace_uprobe_register() to process TRACE_REG_PERF_OPEN/CLOSE and add/del this perf_event to the list. We can probably avoid any locking, but let's start with the "obviously correct" trace_uprobe_filter->rwlock which protects everything. Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c4e29e19fdd7..2a74a93afdae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,12 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* * uprobe event core functions */ @@ -35,6 +41,7 @@ struct trace_uprobe { struct list_head list; struct ftrace_event_class class; struct ftrace_event_call call; + struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; @@ -58,6 +65,18 @@ static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + /* * Allocate new
trace_uprobe and initialize it (including uprobes). */ @@ -87,6 +106,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + init_trace_uprobe_filter(&tu->filter); return tu; error: @@ -544,6 +564,8 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) if (is_trace_uprobe_enabled(tu)) return -EINTR; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + tu->flags |= flag; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) @@ -557,6 +579,8 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) if (!is_trace_uprobe_enabled(tu)) return; + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->flags &= ~flag; } @@ -632,6 +656,30 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_add(&event->hw.tp_list, &tu->filter.perf_events); + else + tu->filter.nr_systemwide++; + write_unlock(&tu->filter.rwlock); + + return 0; +} + +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +{ + write_lock(&tu->filter.rwlock); + if (event->hw.tp_target) + list_del(&event->hw.tp_list); + else + tu->filter.nr_systemwide--; + write_unlock(&tu->filter.rwlock); + + return 0; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -687,6 +735,13 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(tu, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(tu, data); + #endif default: return 0; -- cgit v1.2.3 From 31ba334836c0ac0039084859f14a5b96858493dc Mon Sep 17 00:00:00 
2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:11:58 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to pre-filter Finally implement uprobe_perf_filter() which checks ->nr_systemwide or ->perf_events to figure out whether we need to insert the breakpoint. uprobe_perf_open/close are changed to do uprobe_apply(true/false) when the new perf event comes or goes away. Note that currently this is very suboptimal: - uprobe_register() called by TRACE_REG_PERF_REGISTER becomes a heavy nop, consumer->filter() always returns F at this stage. As it was already discussed we need uprobe_register_only() to avoid the costly register_for_each_vma() when possible. - uprobe_apply() is often overkill. Unless "nr_systemwide != 0" changes we need uprobe_apply_mm(), unapply_uprobe() is almost what we need. - uprobe_apply() can be simply avoided sometimes, see the next changes. Testing: # perf probe -x /lib/libc.so.6 syscall # perl -e 'syscall -1 while 1' & [1] 530 # perf record -e probe_libc:syscall perl -e 'syscall -1 for 1..10; sleep 1' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 79291 A huge ->nhit == 79291 reflects the fact that the background process 530 constantly hits this breakpoint too, even if it doesn't contribute to the output. After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 10 This shows that only the target process was punished by int3.
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 46 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a74a93afdae..b7850f535acf 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -557,7 +557,12 @@ static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); } -static int probe_event_enable(struct trace_uprobe *tu, int flag) +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int +probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) { int ret = 0; @@ -567,6 +572,7 @@ static int probe_event_enable(struct trace_uprobe *tu, int flag) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); tu->flags |= flag; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); if (ret) tu->flags &= ~flag; @@ -656,6 +662,22 @@ static int set_print_fmt(struct trace_uprobe *tu) } #ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.tp_target->mm == mm) + return true; + } + + return false; +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { write_lock(&tu->filter.rwlock); @@ -665,6 +687,8 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide++; write_unlock(&tu->filter.rwlock); + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + return 0; } @@ -677,9 +701,25 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) tu->filter.nr_systemwide--; write_unlock(&tu->filter.rwlock); 
+ uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + return 0; } +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + read_lock(&tu->filter.rwlock); + ret = __uprobe_perf_filter(&tu->filter, mm); + read_unlock(&tu->filter.rwlock); + + return ret; +} + /* uprobe profile handler */ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { @@ -722,7 +762,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE); + return probe_event_enable(tu, TP_FLAG_TRACE, NULL); case TRACE_REG_UNREGISTER: probe_event_disable(tu, TP_FLAG_TRACE); @@ -730,7 +770,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE); + return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: probe_event_disable(tu, TP_FLAG_PROFILE); -- cgit v1.2.3 From f42d24a1d20d2e72d1e5d48930f18b138dfad117 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 17:48:34 +0100 Subject: uprobes/perf: Teach trace_uprobe/perf code to use UPROBE_HANDLER_REMOVE Change uprobe_trace_func() and uprobe_perf_func() to return "int". Change uprobe_dispatcher() to return "trace_ret | perf_ret" although this is not needed, currently TP_FLAG_TRACE/TP_FLAG_PROFILE are mutually exclusive. The only functional change is that uprobe_perf_func() checks the filtering too and returns UPROBE_HANDLER_REMOVE if nobody wants to trace current. Testing: # perf probe -x /lib/libc.so.6 syscall # perf record -e probe_libc:syscall -i perl -e 'fork; syscall -1 for 1..10; wait' # perf report --show-total-period 100.00% 10 perl libc-2.8.so [.] 
syscall Before this patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 20 A child process doesn't have a counter, but still it hits this breakpoint "copied" by dup_mmap(). After the patch: # cat /sys/kernel/debug/tracing/uprobe_profile /lib/libc.so.6 syscall 11 The child process hits this int3 only once and does unapply_uprobe(). Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b7850f535acf..2399f1416555 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -486,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = { }; /* uprobe handler */ -static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -504,7 +504,7 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(task_pt_regs(current)); @@ -514,6 +514,8 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; } /* Event entry printers */ @@ -721,7 +723,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } /* uprobe profile handler */ -static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -730,11 +732,14 @@ static void
uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) int size, __size, i; int rctx; + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + __size = sizeof(*entry) + tu->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return; + return 0; preempt_disable(); @@ -752,6 +757,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) out: preempt_enable(); + return 0; } #endif /* CONFIG_PERF_EVENTS */ @@ -792,18 +798,19 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + int ret = 0; tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; if (tu->flags & TP_FLAG_TRACE) - uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs); #ifdef CONFIG_PERF_EVENTS if (tu->flags & TP_FLAG_PROFILE) - uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs); #endif - return 0; + return ret; } static struct trace_event_functions uprobe_funcs = { -- cgit v1.2.3 From b2fe8ba674e8acbb9e8e63510b802c6d054d88a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2013 19:05:43 +0100 Subject: uprobes/perf: Avoid uprobe_apply() whenever possible uprobe_perf_open/close call the costly uprobe_apply() every time, we can avoid it if: - "nr_systemwide != 0" is not changed. - There is another process/thread with the same ->mm. - copy_process() does inherit_event(). dup_mmap() preserves the inserted breakpoints. - event->attr.enable_on_exec == T, we can rely on uprobe_mmap() called by exec/mmap paths. - tp_target is exiting. Only _close() checks PF_EXITING, I don't think TRACE_REG_PERF_OPEN can hit the dying task too often.
Signed-off-by: Oleg Nesterov --- kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2399f1416555..8dad2a92dee9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -680,30 +680,60 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return false; } +static inline bool +uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +{ + return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); +} + static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ + done = tu->filter.nr_systemwide || + event->parent || event->attr.enable_on_exec || + uprobe_filter_event(tu, event); list_add(&event->hw.tp_list, &tu->filter.perf_events); - else + } else { + done = tu->filter.nr_systemwide; tu->filter.nr_systemwide++; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); return 0; } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) { + bool done; + write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) + if (event->hw.tp_target) { list_del(&event->hw.tp_list); - else + done = tu->filter.nr_systemwide || + (event->hw.tp_target->flags & PF_EXITING) || + uprobe_filter_event(tu, event); + } else { tu->filter.nr_systemwide--; + done = tu->filter.nr_systemwide; + } write_unlock(&tu->filter.rwlock); - uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (!done) + uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); return 0; } -- cgit v1.2.3