From 9b33fa6ba0e2f90fdf407501db801c2511121564 Mon Sep 17 00:00:00 2001 From: "eranian@google.com" Date: Wed, 10 Mar 2010 22:26:05 -0800 Subject: perf_events: Improve task_sched_in() This patch is an optimization in perf_event_task_sched_in() to avoid scheduling the events twice in a row. Without it, the perf_disable()/perf_enable() pair is invoked twice, thereby pinned events counts while scheduling flexible events and we go throuh hw_perf_enable() twice. By encapsulating, the whole sequence into perf_disable()/perf_enable() we ensure, hw_perf_enable() is going to be invoked only once because of the refcount protection. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1268288765-5326-1-git-send-email-eranian@google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 52c69a34d697..3853d49c7d56 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1368,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1380,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.2.3 From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overriden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file they are defined. So let's export it on trace_event_perf.c as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 ++ kernel/trace/trace_event_perf.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8bf61273c58b..455393e71cab 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } +#endif /* * Output diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0709e4f75114..7d79a10c3cde 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -12,6 +12,8 @@ DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); + static char *perf_trace_buf; static char *perf_trace_buf_nmi; -- cgit v1.2.3 From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Mar 2010 14:51:50 +0100 Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code Support for the PMU's BTS features has been upstreamed in v2.6.32, but we still have the old and disabled ptrace-BTS, as Linus noticed it not so long ago. It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without regard for other uses (perf) and doesn't provide the flexibility needed for perf either. Its users are ptrace-block-step and ptrace-bts, since ptrace-bts was never used and ptrace-block-step can be implemented using a much simpler approach. So axe all 3000 lines of it. That includes the *locked_memory*() APIs in mm/mlock.c as well. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Roland McGrath Cc: Oleg Nesterov Cc: Markus Metzger Cc: Steven Rostedt Cc: Andrew Morton LKML-Reference: <20100325135413.938004390@chello.nl> Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 - kernel/ptrace.c | 1 - kernel/sched.c | 43 ------ kernel/trace/Kconfig | 11 -- kernel/trace/Makefile | 1 - kernel/trace/trace.h | 4 - kernel/trace/trace_entries.h | 12 -- kernel/trace/trace_hw_branches.c | 312 --------------------------------------- kernel/trace/trace_selftest.c | 57 ------- 9 files changed, 444 deletions(-) delete mode 100644 kernel/trace/trace_hw_branches.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4799c5f0e6d0..d67f1dbfbe03 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif - - p->bts = NULL; - p->stack_start = stack_start; /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae729a0..9fb51237b18c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7858d3..117b7cad31b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2076,49 +2076,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } -/* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. - */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } -} - /* * wait_task_inactive - wait for a thread to unschedule. * diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 13e13d428cd3..8b1797c4545b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -374,14 +371,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. - config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 78edc6490038..ffb1a5b0550e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2825ef2c0b15..bec2c973ff0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,7 +34,6 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, @@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f399df..dc008c1240da 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000745f5..000000000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger , 2008-2009 - */ -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. - */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..a7084e7c0427 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: case TRACE_KSYM: return 1; } @@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. - */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_KSYM_TRACER static int ksym_selftest_dummy; -- cgit v1.2.3 From 3326c1ceee234e63160852720d48be8a8f7a6d08 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2010 19:09:33 +0100 Subject: perf_event: Make perf fd non seekable Perf_event does not need seeking, so prevent it in order to get rid of default_llseek, which uses the BKL. Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar [drop the nonseekable_open, not needed for anon inodes] Signed-off-by: Frederic Weisbecker --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 63fbce1c80b5..4aa50ff4efc0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2645,6 +2645,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, -- cgit v1.2.3 From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 5 Apr 2010 15:35:57 +0200 Subject: perf: Store active software events in a hashlist Each time a software event triggers, we need to walk through the entire list of events from the current cpu and task contexts to retrieve a running perf event that matches. We also need to check a matching perf event is actually counting. This walk is wasteful and makes the event fast path scaling down with a growing number of events running on the same contexts. To solve this, we store the running perf events in a hashlist to get an immediate access to them against their type:event_id when they trigger. v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic maths along the way) - Only allocate hlist for online cpus, but keep track of the refcount on offline possible cpus too, so that we allocate it if needed when it becomes online. - Drop the kref use as it's not adapted to our tricks anymore. v3: - Fix bad refcount check (address instead of value). Thanks to Eric Dumazet who spotted this. - While exiting cpu, move the hlist release out of the IPI path to lock the hlist mutex sanely. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 246 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 183 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fcf42dcd6089..9efdfe5b8d3b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + u64 hash; + struct swevent_hlist *hlist; + + hash = swevent_hash(type, event_id); + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return &hlist->heads[hash]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event, return 0; } +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist; + + if (!cpuctx->swevent_hlist) + return; + + hlist = cpuctx->swevent_hlist; + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From df8290bf7ea6b3051e2f315579a6e829309ec1ed Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 9 Apr 2010 00:28:14 +0200 Subject: perf: Make clock software events consistent with general exclusion rules The cpu/task clock events implement their own version of exclusion on top of exclude_user and exclude_kernel. The result is that when the event triggered in the kernel but we have exclude_kernel set, we try to rewind using task_pt_regs. There are two side effects of this: - we call task_pt_regs even on kernel threads, which doesn't give us the desired result. - if the event occured in the kernel, we shouldn't rewind to the user context. We want to actually ignore the event. get_irq_regs() will always give us the right interrupted context, so use its result and submit it to perf_exclude_context() that knows when an event must be ignored. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/perf_event.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9efdfe5b8d3b..095101d685bc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4164,15 +4164,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; -- cgit v1.2.3 From 93ccae7a2227466a0d071fe52c51319f2f34c365 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 12 Apr 2010 13:17:08 -0400 Subject: tracing/kprobes: Support basic types on dynamic events Support basic types of integer (u8, u16, u32, u64, s8, s16, s32, s64) in kprobe tracer. With this patch, users can specify above basic types on each arguments after ':'. If omitted, the argument type is set as unsigned long (u32 or u64, arch-dependent). e.g. echo 'p account_system_time+0 hardirq_offset=%si:s32' > kprobe_events adds a probe recording hardirq_offset in signed-32bits value on the entry of account_system_time. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frederic Weisbecker LKML-Reference: <20100412171708.3790.18599.stgit@localhost6.localdomain6> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace.h | 16 +- kernel/trace/trace_kprobe.c | 535 +++++++++++++++++++++++++++----------------- 2 files changed, 331 insertions(+), 220 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bec2c973ff0c..3ebdb6bd2362 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -102,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1251e367bae9..a7514326052b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -204,6 +326,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_call call; struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -347,11 +428,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); @@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; + field = (struct kprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1046,13 +1151,14 @@ partial: enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; + field = (struct kretprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } @@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); -- cgit v1.2.3 From 95476b64ab11d528de2557366ec584977c215b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Apr 2010 23:42:18 +0200 Subject: perf: Fix hlist related build error hlist helpers need to be available for all software events, not only trace events. Pull them out outside the ifdef CONFIG_EVENT_TRACING section. Fixes: kernel/perf_event.c:4573: error: implicit declaration of function 'swevent_hlist_put' kernel/perf_event.c:4614: error: implicit declaration of function 'swevent_hlist_get' kernel/perf_event.c:5534: error: implicit declaration of function 'swevent_hlist_release Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1271281338-23491-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 095101d685bc..07b7a435bf03 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4313,36 +4313,6 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_TRACING - -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) -{ - struct perf_sample_data data; - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - perf_sample_data_init(&data, addr); - data.raw = &raw; - - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tp_event); - -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { struct swevent_hlist *hlist; @@ -4442,6 +4412,36 @@ static int swevent_hlist_get(struct perf_event *event) return err; } +#ifdef CONFIG_EVENT_TRACING + +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size, struct pt_regs *regs) +{ + struct perf_sample_data data; + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + perf_sample_data_init(&data, addr); + data.raw = &raw; + + /* Trace events already protected against recursion */ + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); -- cgit v1.2.3 From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Mon, 19 Apr 2010 13:32:41 +0800 Subject: perf: Enhance perf to allow for guest statistic collection from host Below patch introduces perf_guest_info_callbacks and related register/unregister functions. Add more PERF_RECORD_MISC_XXX bits meaning guest kernel and guest user space. Signed-off-by: Zhang Yanmin Signed-off-by: Avi Kivity --- kernel/perf_event.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 07b7a435bf03..9dbe8cdaf145 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski } +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + /* * Output */ @@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ -- cgit v1.2.3 From 4b402210486c6414fe5fbfd85934a0a22da56b04 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 16 Apr 2010 23:20:00 +0200 Subject: mutex: Don't spin when the owner CPU is offline or other weird cases Due to recent load-balancer changes that delay the task migration to the next wakeup, the adaptive mutex spinning ends up in a live lock when the owner's CPU gets offlined because the cpu_online() check lives before the owner running check. This patch changes mutex_spin_on_owner() to return 0 (don't spin) in any case where we aren't sure about the owner struct validity or CPU number, and if the said CPU is offline. There is no point going back & re-evaluate spinning in corner cases like that, let's just go to sleep. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Peter Zijlstra LKML-Reference: <1271212509.13059.135.camel@pasglop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6af210a7de70..de0bd26e520a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3780,7 +3780,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3790,14 +3790,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3816,7 +3816,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif -- cgit v1.2.3 From 47dd5be2d6a82b8153e059a1d09eb3879d485bfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Apr 2010 07:23:51 +0200 Subject: workqueue: flush_delayed_work: keep the original workqueue for re-queueing flush_delayed_work() always uses keventd_wq for re-queueing, but it should use the workqueue this dwork was queued on. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805c..5bfb213984b2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } -- cgit v1.2.3 From 8b08ca52f5942c21564bbb90ccfb61053f2c26a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Apr 2010 13:02:07 -0700 Subject: rcu: Fix RCU lockdep splat in set_task_cpu on fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de0bd26e520a..3c2a54f70ffe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However since task_group() uses task_subsys_state() which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else -- cgit v1.2.3 From 8b46f880841aac821af8efa6581bb0e46b8b9845 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Apr 2010 13:02:08 -0700 Subject: rcu: Fix RCU lockdep splat on freezer_fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Paul E. McKenney Acked-by: Li Zefan Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/cgroup_freezer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index da5e13975531..e5c0244962b0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -205,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the -- cgit v1.2.3 From 87e9b2024659c614a876ce359a57e98a47b5ef37 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 17 Apr 2010 18:11:59 +0200 Subject: hw-breakpoints: Check disabled breakpoints again We stopped checking disabled breakpoints because we weren't allowing breakpoints on NULL addresses. And gdb tends to set NULL addresses on inactive breakpoints. But refusing NULL addresses was actually a regression that has been fixed now. There is no reason anymore to not validate inactive breakpoint settings. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03808ed342a6..9ed9ae3a48b3 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -316,17 +316,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. - * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) -- cgit v1.2.3 From b2812d031dea86926e9c10f7714af33ac2f6b43d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 18 Apr 2010 18:11:53 +0200 Subject: hw-breakpoints: Change/Enforce some breakpoints policies The current policies of breakpoints in x86 and SH are the following: - task bound breakpoints can only break on userspace addresses - cpu wide breakpoints can only break on kernel addresses The former rule prevents ptrace breakpoints to be set to trigger on kernel addresses, which is good. But as a side effect, we can't breakpoint on kernel addresses for task bound breakpoints. The latter rule simply makes no sense, there is no reason why we can't set breakpoints on userspace while performing cpu bound profiles. We want the following new policies: - task bound breakpoint can set userspace address breakpoints, with no particular privilege required. - task bound breakpoints can set kernelspace address breakpoints but must be privileged to do that. - cpu bound breakpoints can do what they want as they are privileged already. To implement these new policies, this patch checks if we are dealing with a kernel address breakpoint, if so and if the exclude_kernel parameter is set, we tell the user that the breakpoint is invalid, which makes a good generic ptrace protection. If we don't have exclude_kernel, ensure the user has the right privileges as kernel breakpoints are quite sensitive (risk of trap recursion attacks and global performance impacts). [ Paul Mundt: keep addr space check for sh signal delivery and fix double function declaration] Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Paul Mundt Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar Signed-off-by: Paul Mundt --- kernel/hw_breakpoint.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 9ed9ae3a48b3..89e8a050c43a 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -308,6 +308,28 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -316,7 +338,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) @@ -363,7 +385,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); -- cgit v1.2.3 From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 11 Apr 2010 18:55:56 +0200 Subject: hw-breakpoints: Separate constraint space for data and instruction breakpoints There are two outstanding fashions for archs to implement hardware breakpoints. The first is to separate breakpoint address pattern definition space between data and instruction breakpoints. We then have typically distinct instruction address breakpoint registers and data address breakpoint registers, delivered with separate control registers for data and instruction breakpoints as well. This is the case of PowerPc and ARM for example. The second consists in having merged breakpoint address space definition between data and instruction breakpoint. Address registers can host either instruction or data address and the access mode for the breakpoint is defined in a control register. This is the case of x86 and Super H. This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config that archs can select if they belong to the second case. Those will have their slot allocation merged for instructions and data breakpoints. The others will have a separate slot tracking between data and instruction breakpoints. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 86 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 89e8a050c43a..8ead1345e33b 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -45,18 +45,28 @@ #include +enum bp_type_idx { + TYPE_INST = 0, +#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS + TYPE_DATA = 0, +#else + TYPE_DATA = 1, +#endif + TYPE_MAX +}; + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,14 +77,22 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -84,7 +102,7 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) { struct perf_event_context *ctx = tsk->perf_event_ctxp; struct list_head *list; @@ -105,7 +123,8 @@ static int task_bp_pinned(struct task_struct *tsk) */ list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + if (find_slot_idx(bp) == type) + count++; } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -118,18 +137,19 @@ static int task_bp_pinned(struct task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(tsk, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +157,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(tsk, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -156,14 +176,15 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, + enum bp_type_idx type) { unsigned int *tsk_pinned; int count = 0; - count = task_bp_pinned(tsk); + count = task_bp_pinned(tsk, type); - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -178,7 +199,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -186,20 +208,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++; else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--; } /* @@ -246,14 +268,21 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; + enum bp_type_idx type; - fetch_bp_busy_slots(&slots, bp); + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + fetch_bp_busy_slots(&slots, bp, type); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) == HBP_NUM) return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type); return 0; } @@ -273,7 +302,10 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { - toggle_bp_slot(bp, false); + enum bp_type_idx type; + + type = find_slot_idx(bp); + toggle_bp_slot(bp, false, type); } void release_bp_slot(struct perf_event *bp) -- cgit v1.2.3 From f93a20541134fa767e8dc4eb32e956d30b9f6b92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Apr 2010 00:32:30 +0200 Subject: hw-breakpoints: Handle breakpoint weight in allocation constraints Depending on their nature and on what an arch supports, breakpoints may consume more than one address register. For example a simple absolute address match usually only requires one address register. But an address range match may consume two registers. Currently our slot allocation constraints, that tend to reflect the limited arch's resources, always consider that a breakpoint consumes one slot. Then provide a way for archs to tell us the weight of a breakpoint through a new hw_breakpoint_weight() helper. This weight will be computed against the generic allocation constraints instead of a constant value. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 8ead1345e33b..974498b858fc 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -77,6 +77,11 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) { if (bp->attr.bp_type & HW_BREAKPOINT_RW) @@ -124,7 +129,7 @@ static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) if (find_slot_idx(bp) == type) - count++; + count += hw_breakpoint_weight(bp); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -173,26 +178,41 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, } } +/* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + /* * Add a pinned breakpoint for the given task in our constraint table */ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, - enum bp_type_idx type) + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(tsk, type); + old_idx = old_count - 1; + idx = old_idx + weight; tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } @@ -200,7 +220,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, * Add/remove the given breakpoint in our constraint table */ static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -208,20 +229,20 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; } /* @@ -269,6 +290,7 @@ static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int weight; /* Basic checks */ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || @@ -276,13 +298,16 @@ static int __reserve_bp_slot(struct perf_event *bp) return -EINVAL; type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + fetch_bp_busy_slots(&slots, bp, type); + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) + if (slots.pinned + (!!slots.flexible) > HBP_NUM) return -ENOSPC; - toggle_bp_slot(bp, true, type); + toggle_bp_slot(bp, true, type, weight); return 0; } @@ -303,9 +328,11 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { enum bp_type_idx type; + int weight; type = find_slot_idx(bp); - toggle_bp_slot(bp, false, type); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) -- cgit v1.2.3 From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 23 Apr 2010 05:59:55 +0200 Subject: hw-breakpoints: Get the number of available registers on boot dynamically The breakpoint generic layer assumes that archs always know in advance the static number of address registers available to host breakpoints through the HBP_NUM macro. However this is not true for every archs. For example Arm needs to get this information dynamically to handle the compatiblity between different versions. To solve this, this patch proposes to drop the static HBP_NUM macro and let the arch provide the number of available slots through a new hw_breakpoint_slots() function. For archs that have CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it will be called once as the number of registers fits for instruction and data breakpoints together. For the others it will be called first to get the number of instruction breakpoint registers and another time to get the data breakpoint registers, the targeted type is given as a parameter of hw_breakpoint_slots(). Reported-by: Will Deacon Signed-off-by: Frederic Weisbecker Acked-by: Paul Mundt Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel Cc: Ingo Molnar --- kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++++++++++++++++----------- kernel/trace/trace_ksym.c | 26 +++++++---------------- 2 files changed, 48 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 974498b858fc..684b710cbb91 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,20 +40,12 @@ #include #include #include +#include #include #include #include -enum bp_type_idx { - TYPE_INST = 0, -#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS - TYPE_DATA = 0, -#else - TYPE_DATA = 1, -#endif - TYPE_MAX -}; /* * Constraints data @@ -63,11 +55,15 @@ enum bp_type_idx { static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); +static int nr_slots[TYPE_MAX]; + +static int constraints_initialized; + /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { unsigned int pinned; @@ -99,7 +95,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) int i; unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -292,6 +288,10 @@ static int __reserve_bp_slot(struct perf_event *bp) enum bp_type_idx type; int weight; + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + /* Basic checks */ if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || bp->attr.bp_type == HW_BREAKPOINT_INVALID) @@ -304,7 +304,7 @@ static int __reserve_bp_slot(struct perf_event *bp) fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > HBP_NUM) + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; toggle_bp_slot(bp, true, type, weight); @@ -551,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; } core_initcall(init_hw_breakpoint); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index d59cd6879477..8eaf00749b65 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -34,12 +34,6 @@ #include -/* - * For now, let us restrict the no. of symbols traced simultaneously to number - * of available hardware breakpoint registers. - */ -#define KSYM_TRACER_MAX HBP_NUM - #define KSYM_TRACER_OP_LEN 3 /* rw- */ struct trace_ksym { @@ -53,7 +47,6 @@ struct trace_ksym { static struct trace_array *ksym_trace_array; -static unsigned int ksym_filter_entry_count; static unsigned int ksym_tracing_enabled; static HLIST_HEAD(ksym_filter_head); @@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) struct trace_ksym *entry; int ret = -ENOMEM; - if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { - printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" - " new requests for tracing can be accepted now.\n", - KSYM_TRACER_MAX); - return -ENOSPC; - } - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (IS_ERR(entry->ksym_hbp)) { ret = PTR_ERR(entry->ksym_hbp); - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); + if (ret == -ENOSPC) { + printk(KERN_ERR "ksym_tracer: Maximum limit reached." + " No new requests for tracing can be accepted now.\n"); + } else { + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + } goto err; } hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - ksym_filter_entry_count++; return 0; @@ -265,7 +255,6 @@ static void __ksym_trace_reset(void) hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, ksym_hlist) { unregister_wide_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); @@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file, goto out_unlock; } /* Error or "symbol:---" case: drop it */ - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); -- cgit v1.2.3 From 048c852051d2bd5da54a4488bc1f16b0fc74c695 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 May 2010 10:11:35 +0200 Subject: perf: Fix resource leak in failure path of perf_event_open() perf_event_open() kfrees event after init failure which doesn't release all resources allocated by perf_event_alloc(). Use free_event() instead. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: LKML-Reference: <4BDBE237.1040809@kernel.org> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2f3fbf84215a..3d1552d3c12b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4897,7 +4897,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) -- cgit v1.2.3 From 777d0411cd1e384115985dac5ccd42031e3eee2b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 3 May 2010 15:39:45 +0200 Subject: hw_breakpoints: Fix percpu build failure Fix this build error: kernel/hw_breakpoint.c:58:1: error: pasting "__pcpu_scope_" and "*" does not give a valid preprocessing token It happens if CONFIG_DEBUG_FORCE_WEAK_PER_CPU, because we concatenate someting with the name and we have the "*" in the name. Cc: Frederic Weisbecker Cc: Will Deacon Cc: Paul Mundt Cc: Mahesh Salgaonkar Cc: K. Prasad Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jason Wessel LKML-Reference: <20100503133942.GA5497@nowhere> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 684b710cbb91..7a56b22e0602 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -55,7 +55,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); -- cgit v1.2.3 From d9f599e1e6d019968b35d2dc63074b9e8964fa69 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 20 Mar 2010 17:39:11 +0300 Subject: perf: Fix check at end of event search The original code doesn't work because "call" is never NULL there. Signed-off-by: Dan Carpenter LKML-Reference: <20100320143911.GF5331@bicker> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 88c0b6dbd7fe..58092d844a1f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, } err = -EINVAL; - if (!call) + if (&call->list == &ftrace_events) goto out_unlock; err = -EEXIST; -- cgit v1.2.3 From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 17:31:38 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by only doing this when we free the counter itself. Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1273160566.5605.404.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d1552d3c12b..f13c3db765f4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -341,6 +341,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + if (event->state > PERF_EVENT_STATE_FREE) + return; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1856,6 +1859,8 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + event->state = PERF_EVENT_STATE_FREE; + WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); -- cgit v1.2.3 From a0507c84bf47dfd204299774f45fd16da33f0619 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2010 15:42:53 +0200 Subject: perf: Annotate perf_event_read_group() vs perf_event_release_kernel() Stephane reported a lockdep warning while using PERF_FORMAT_GROUP. The issue is that perf_event_read_group() takes faults while holding the ctx->mutex, while perf_event_release_kernel() can be called from munmap(). Which makes for an AB-BA deadlock. Except we can never establish the deadlock because we'll only ever call perf_event_release_kernel() after all file descriptors are dead so there is no concurrency possible. Reported-by: Stephane Eranian Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 49d8be5a45e3..34659d4085c7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1867,7 +1867,19 @@ int perf_event_release_kernel(struct perf_event *event) event->state = PERF_EVENT_STATE_FREE; WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); perf_event_remove_from_context(event); mutex_unlock(&ctx->mutex); @@ -5305,7 +5317,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, -- cgit v1.2.3 From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 23 Apr 2010 13:56:00 +0800 Subject: perf: Add group scheduling transactional APIs Add group scheduling transactional APIs to struct pmu. These APIs will be implemented in arch code, based on Peter's idea as below. > the idea behind hw_perf_group_sched_in() is to not perform > schedulability tests on each event in the group, but to add the group > as a whole and then perform one test. > > Of course, when that test fails, you'll have to roll-back the whole > group again. > > So start_txn (or a better name) would simply toggle a flag in the pmu > implementation that will make pmu::enable() not perform the > schedulablilty test. > > Then commit_txn() will perform the schedulability test (so note the > method has to have a !void return value. > > This will allow us to use the regular > kernel/perf_event.c::group_sched_in() and all the rollback code. > Currently each hw_perf_group_sched_in() implementation duplicates all > the rolllback code (with various bugs). ->start_txn: Start group events scheduling transaction, set a flag to make pmu::enable() not perform the schedulability test, it will be performed at commit time. ->commit_txn: Commit group events scheduling transaction, perform the group schedulability as a whole ->cancel_txn: Stop group events scheduling transaction, clear the flag so pmu::enable() will perform the schedulability test. Reviewed-by: Stephane Eranian Reviewed-by: Frederic Weisbecker Signed-off-by: Lin Ming Cc: David Miller Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 34659d4085c7..bb06382f98e7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -83,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -644,15 +636,20 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; + + if (txn) + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; @@ -667,9 +664,19 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (txn) { + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + + return 0; + } + } group_error: + if (txn) + pmu->cancel_txn(pmu); + /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: -- cgit v1.2.3 From 6e85158cf5a2385264316870256fb6ad681156a0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 8 May 2010 20:58:00 +1000 Subject: perf_event: Make software events work again Commit 6bde9b6ce0127e2a56228a2071536d422be31336 ("perf: Add group scheduling transactional APIs") added code to allow a group to be scheduled in a single transaction. However, it introduced a bug in handling events whose pmu does not implement transactions -- at the end of scheduling in the events in the group, in the non-transactional case the code now falls through to the group_error label, and proceeds to unschedule all the events in the group and return failure. This fixes it by returning 0 (success) in the non-transactional case. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Lin Ming Cc: Frederic Weisbecker Cc: eranian@gmail.com LKML-Reference: <20100508105800.GB10650@brick.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bb06382f98e7..180151ff8376 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -664,13 +664,13 @@ group_sched_in(struct perf_event *group_event, } } - if (txn) { - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!txn) + return 0; - return 0; - } + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; } group_error: -- cgit v1.2.3 From c0614829c16ab9d31f1b7d40516decfbf3d32102 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 27 Apr 2010 18:33:12 -0400 Subject: kprobes: Move enable/disable_kprobe() out from debugfs code Move enable/disable_kprobe() API out from debugfs related code, because these interfaces are not related to debugfs interface. This fixes a compiler warning. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Acked-by: Tony Luck Cc: systemtap Cc: DLE LKML-Reference: <20100427223312.2322.60512.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 132 +++++++++++++++++++++++++++---------------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0ed46f3e51e9..282035f3ae96 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. */ + ret = -EINVAL; + goto out; + } + + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + void __kprobes dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); @@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. */ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } - - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; - - if (!kprobes_all_disarmed && kprobe_disabled(p)) { - p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); - } -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; -- cgit v1.2.3 From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:16:11 +0200 Subject: tracing: Drop lock_acquired waittime field Drop the waittime field from the lock_acquired event, we can calculate it by substracting the lock_acquired event timestamp with the matching lock_acquire one. It is not needed and takes useless space in the traces. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Steven Rostedt --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41cb..31e22e742368 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3380,7 +3380,7 @@ found_it: hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { -- cgit v1.2.3 From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 May 2010 06:24:25 +0200 Subject: tracing: Drop the nested field from lock_release event Drop the nested field as we don't use it. Every nested state can be computed from a state machine on post processing already. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Hitoshi Mitake Cc: Steven Rostedt --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 31e22e742368..e9c759f06c1d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); -- cgit v1.2.3 From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 May 2010 08:31:49 +0200 Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP" This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c. It causes various crashes and hangs when events are activated. The cause is not fully understood yet but we need to revert it because the effects are severe. Reported-by: Stephane Eranian Reported-by: Lin Ming Cc: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 180151ff8376..a9047463fd83 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -334,9 +334,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; - if (event->state > PERF_EVENT_STATE_FREE) - return; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -1871,8 +1868,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - event->state = PERF_EVENT_STATE_FREE; - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: -- cgit v1.2.3 From 050735b08ca8a016bbace4445fa025b88fee770b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 11:51:53 +0200 Subject: perf: Fix exit() vs PERF_FORMAT_GROUP Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work as expected if the task the counters were attached to quit before the read() call. The cause is that we unconditionally destroy the grouping when we remove counters from their context. Fix this by splitting off the group destroy from the list removal such that perf_event_remove_from_context() does not do this and change perf_event_release() to do so. Reported-by: Corey Ashford Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: # .34.x LKML-Reference: <1273571513.5605.3527.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a9047463fd83..c97e82518403 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -308,8 +308,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - if (list_empty(&event->group_entry)) return; ctx->nr_events--; @@ -333,6 +331,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; /* * If this was a group event with sibling events then @@ -1868,6 +1872,12 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -1882,7 +1892,10 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_event_remove_from_context(event); + raw_spin_lock_irq(&ctx->lock); + list_del_event(event, ctx); + perf_destroy_group(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); -- cgit v1.2.3 From 96c21a460a37880abfbc8445d5b098dbab958a29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 May 2010 16:19:10 +0200 Subject: perf: Fix exit() vs event-groups Corey reported that the value scale times of group siblings are not updated when the monitored task dies. The problem appears to be that we only update the group leader's time values, fix it by updating the whole group. Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: # .34.x LKML-Reference: <1273588935.1810.6.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c97e82518403..a4fa381db3c2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -255,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -320,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -501,18 +513,6 @@ retry: raw_spin_unlock_irq(&ctx->lock); } -/* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - /* * Cross CPU call to disable a performance event */ -- cgit v1.2.3