From eb1e79611cc9bfe21978230e3521e77ea2d7874a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Mar 2010 00:08:59 +0100 Subject: perf: Correctly align perf event tracing buffer The trace event buffer used by perf to record raw sample events is typed as an array of char and may then not be aligned to 8 by alloc_percpu(). But we need it to be aligned to 8 in sparc64 because we cast this buffer into a random structure type built by the TRACE_EVENT() macro to store the traces. So if a random 64 bits field is accessed inside, it may be not under an expected good alignment. Use an array of long instead to force the appropriate alignment, and perform a compile time check to ensure the size in byte of the buffer is a multiple of sizeof(long) so that its actual size doesn't get shrinked under us. This fixes unaligned accesses reported while using perf lock in sparc 64. Suggested-by: David Miller Suggested-by: Tejun Heo Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar Cc: David Miller Cc: Steven Rostedt --- kernel/trace/trace_event_perf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 81f691eb3a30..0565bb42566f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -17,7 +17,12 @@ EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); static char *perf_trace_buf; static char *perf_trace_buf_nmi; -typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ; +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; /* Count the events in use (per event id, not per instance) */ static int total_ref_count; @@ -130,6 +135,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, char *trace_buf, *raw_data; int pc, cpu; + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + pc = preempt_count(); /* Protect the per cpu buffer, begin the rcu read side */ @@ -152,7 +159,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; tracing_generic_entry_update(entry, *irq_flags, pc); -- cgit v1.2.3 From e49a5bd38159dfb1928fd25b173bc9de4bbadb21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 22 Mar 2010 19:40:03 +0100 Subject: perf: Use hot regs with software sched switch/migrate events Scheduler's task migration events don't work because they always pass NULL regs perf_sw_event(). The event hence gets filtered in perf_swevent_add(). Scheduler's context switches events use task_pt_regs() to get the context when the event occured which is a wrong thing to do as this won't give us the place in the kernel where we went to sleep but the place where we left userspace. The result is even more wrong if we switch from a kernel thread. Use the hot regs snapshot for both events as they belong to the non-interrupt/exception based events family. Unlike page faults or so that provide the regs matching the exact origin of the event, we need to save the current context. This makes the task migration event working and fix the context switch callchains and origin ip. Example: perf record -a -e cs Before: 10.91% ksoftirqd/0 0 [k] 0000000000000000 | --- (nil) perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_swevent_ctx_event do_perf_sw_event __perf_sw_event perf_event_task_sched_out schedule run_ksoftirqd kthread kernel_thread_helper After: 23.77% hald-addon-stor [kernel.kallsyms] [k] schedule | --- schedule | |--60.00%-- schedule_timeout | wait_for_common | wait_for_completion | blk_execute_rq | scsi_execute | scsi_execute_req | sr_test_unit_ready | | | |--66.67%-- sr_media_change | | media_changed | | cdrom_media_changed | | sr_block_media_changed | | check_disk_change | | cdrom_open v2: Always build perf_arch_fetch_caller_regs() now that software events need that too. They don't need it from modules, unlike trace events, so we keep the EXPORT_SYMBOL in trace_event_perf.c Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Ingo Molnar Cc: David Miller --- kernel/perf_event.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 574ee58a3046..b0feb4795af3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1164,11 +1164,9 @@ void perf_event_task_sched_out(struct task_struct *task, struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; - struct pt_regs *regs; int do_switch = 1; - regs = task_pt_regs(task); - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; -- cgit v1.2.3 From 8bb39f9aa068262732fe44b965d7a6eb5a5a7d67 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 26 Mar 2010 11:11:33 +0100 Subject: perf: Fix 'perf sched record' deadlock perf sched record can deadlock a box should the holder of handle->data->lock take an interrupt, and then attempt to acquire an rq lock held by a CPU trying to acquire the same lock. Disable interrupts. CPU0 CPU1 sched event with rq->lock held grab handle->data->lock spin on handle->data->lock interrupt try to grab rq->lock Reported-by: Li Zefan Signed-off-by: Mike Galbraith Tested-by: Li Zefan Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1269598293.6174.8.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b0feb4795af3..96aae13c7960 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3376,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; - int size; struct task_struct *task = task_event->task; - int ret; + unsigned long flags; + int size, ret; + + /* + * If this CPU attempts to acquire an rq lock held by a CPU spinning + * in perf_output_lock() from interrupt context, it's game over. + */ + local_irq_save(flags); size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); - if (ret) + if (ret) { + local_irq_restore(flags); return; + } task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3395,6 +3403,7 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); + local_irq_restore(flags); } static int perf_event_task_match(struct perf_event *event) -- cgit v1.2.3 From 26d80aa782e708c380a47601779d42d30bf016d6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 3 Apr 2010 12:22:05 +0200 Subject: perf: Always build the stub perf_arch_fetch_caller_regs version Now that software events use perf_arch_fetch_caller_regs() too, we need the stub version to be always built in for archs that don't implement it. Fixes the following build error in PARISC: kernel/built-in.o: In function `perf_event_task_sched_out': (.text.perf_event_task_sched_out+0x54): undefined reference to `perf_arch_fetch_caller_regs' Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 96aae13c7960..681af806d76b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2784,12 +2784,11 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } -#endif + /* * Output -- cgit v1.2.3