76 files changed, 4431 insertions, 902 deletions
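To illustrate the interface this series introduces (see include/linux/hw_breakpoint.h and kernel/hw_breakpoint.c in the hunks below), here is a minimal, hypothetical kernel-module sketch that registers a system-wide write watchpoint and tears it down again. Only the register/unregister calls, the HW_BREAKPOINT_* constants and the perf_callback_t signature are taken from this revision of the patch; the module scaffolding, the "watched" variable and the handler name are invented for the example, the error-return convention is hedged (the !CONFIG stub returns NULL), and later kernels reworked this API around perf_event_attr, so this matches only the patch as posted here.

/*
 * Hypothetical usage sketch, not part of the patch: set a wide (all-CPU)
 * write watchpoint using the interface added by this series.
 */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static int watched;			/* variable we want to trap writes to */
static struct perf_event **wp;		/* one event per online cpu */

/* Runs from the debug exception path whenever "watched" is written */
static void wp_triggered(struct perf_event *bp, void *data)
{
	pr_info("hw_breakpoint: write to watched variable, addr=0x%lx\n",
		hw_breakpoint_addr(bp));
}

static int __init wp_init(void)
{
	wp = register_wide_hw_breakpoint((unsigned long)&watched,
					 HW_BREAKPOINT_LEN_4,
					 HW_BREAKPOINT_W,
					 wp_triggered,
					 true /* active immediately */);
	/* Hedged error check: cover both a NULL return and an ERR_PTR */
	if (!wp || IS_ERR(wp))
		return wp ? PTR_ERR(wp) : -ENODEV;

	watched = 1;			/* this write should invoke wp_triggered() */
	return 0;
}

static void __exit wp_exit(void)
{
	unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");
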
diff --git a/arch/Kconfig b/arch/Kconfig index 7f418bbc261a..eef3bbb97075 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool +config HAVE_HW_BREAKPOINT + bool + depends on HAVE_PERF_EVENTS + select ANON_INODES + select PERF_EVENTS + + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 72ace9515a07..178084b4377c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -49,6 +49,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HW_BREAKPOINT select HAVE_ARCH_KMEMCHECK config OUTPUT_FORMAT diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa5..9f828f87ca35 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += hw_breakpoint.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index bb70e397aa84..7a15588e45d4 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,7 @@ #include <linux/user.h> #include <linux/elfcore.h> +#include <asm/debugreg.h> /* * fill in the user structure for an a.out core dump @@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; + aout_dump_debugregs(dump); if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 3ea6f37be9e2..fdabd8435765 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -18,6 +18,7 @@ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP3 (0x8) /* db3 */ +#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ @@ -49,6 +50,8 @@ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ +#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */ +#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ @@ -67,4 +70,34 @@ #define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ #define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ +/* + * HW breakpoint additions + */ +#ifdef __KERNEL__ + +DECLARE_PER_CPU(unsigned long, dr7); + +static inline void hw_breakpoint_disable(void) +{ + /* Zero the control register for HW Breakpoint */ + set_debugreg(0UL, 7); + + /* Zero-out the individual HW breakpoint address registers */ + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); +} + +static inline int hw_breakpoint_active(void) +{ + return __get_cpu_var(dr7) & 
DR_GLOBAL_ENABLE_MASK; +} + +extern void aout_dump_debugregs(struct user *dump); + +extern void hw_breakpoint_restore(void); + +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h new file mode 100644 index 000000000000..0675a7c4c20e --- /dev/null +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -0,0 +1,73 @@ +#ifndef _I386_HW_BREAKPOINT_H +#define _I386_HW_BREAKPOINT_H + +#ifdef __KERNEL__ +#define __ARCH_HW_BREAKPOINT_H + +/* + * The name should probably be something dealt in + * a higher level. While dealing with the user + * (display/resolving) + */ +struct arch_hw_breakpoint { + char *name; /* Contains name of the symbol to set bkpt */ + unsigned long address; + u8 len; + u8 type; +}; + +#include <linux/kdebug.h> +#include <linux/percpu.h> +#include <linux/list.h> + +/* Available HW breakpoint length encodings */ +#define X86_BREAKPOINT_LEN_1 0x40 +#define X86_BREAKPOINT_LEN_2 0x44 +#define X86_BREAKPOINT_LEN_4 0x4c +#define X86_BREAKPOINT_LEN_EXECUTE 0x40 + +#ifdef CONFIG_X86_64 +#define X86_BREAKPOINT_LEN_8 0x48 +#endif + +/* Available HW breakpoint type encodings */ + +/* trigger on instruction execute */ +#define X86_BREAKPOINT_EXECUTE 0x80 +/* trigger on memory write */ +#define X86_BREAKPOINT_WRITE 0x81 +/* trigger on memory read or write */ +#define X86_BREAKPOINT_RW 0x83 + +/* Total number of available HW breakpoint registers */ +#define HBP_NUM 4 + +struct perf_event; +struct pmu; + +extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk); +extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, + unsigned long val, void *data); + + +int arch_install_hw_breakpoint(struct perf_event *bp); +void arch_uninstall_hw_breakpoint(struct perf_event *bp); +void hw_breakpoint_pmu_read(struct perf_event *bp); +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp); + +extern void +arch_fill_perf_breakpoint(struct perf_event *bp); + +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type); +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type); + +extern int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type); + +extern struct pmu perf_ops_bp; + +#endif /* __KERNEL__ */ +#endif /* _I386_HW_BREAKPOINT_H */ + diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c9786480f0fe..6f8ec1c37e0a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -30,6 +30,7 @@ struct mm_struct; #include <linux/math64.h> #include <linux/init.h> +#define HBP_NUM 4 /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -422,6 +423,8 @@ extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; +struct perf_event; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -443,13 +446,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; - /* Hardware debugging registers: */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... 
*/ + unsigned long debugreg6; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d8e5d0cdd678..4f2e66e29ecc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 68537e957a9b..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..4d267fb77828 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,549 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/irqflags.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/debugreg.h> + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, dr7); +EXPORT_PER_CPU_SYMBOL(dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. 
+ */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | + DR_GLOBAL_SLOWDOWN; + return bp_info; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = &__get_cpu_var(dr7); + *dr7 &= ~encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case X86_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case X86_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case X86_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + /* + * For kernel-addresses, either the address or symbol name can be + * specified. 
+ */ + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) + return 0; + + return -EINVAL; +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; + break; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { + case HW_BREAKPOINT_LEN_1: + info->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + info->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + info->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->callback) + ret = arch_store_info(bp); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (info->address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(info->address, info->len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(info->address, info->len)) + return -EFAULT; + } + + return 0; +} + +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. 
+ * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} +EXPORT_SYMBOL_GPL(aout_dump_debugregs); + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap<n> set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +static int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + /* + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. + */ + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + /* + * bp can be NULL due to lazy debug register switching + * or due to concurrent perf counter removing. 
+ */ + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); + + rcu_read_unlock(); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu(); + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8d82a77a3f3b..34e86b67550c 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/system.h> @@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c5f1f117e0c0..3fe86d706a14 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -56,6 +56,7 @@ #include <asm/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> +#include <asm/debugreg.h> void jprobe_return_end(void); @@ -945,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include <asm/desc.h> #include <asm/system.h> #include <asm/cacheflush.h> +#include <asm/debugreg.h> static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/debugreg.h> static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5284cd2b5776..744508e7cfdd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,7 @@ #include <linux/clockchips.h> #include <linux/random.h> #include <trace/events/power.h> +#include 
<linux/hw_breakpoint.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -17,6 +18,7 @@ #include <asm/uaccess.h> #include <asm/i387.h> #include <asm/ds.h> +#include <asm/debugreg.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -103,14 +105,7 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..d5bd3132ee70 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -58,6 +58,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index eb62cbcaa490..70cf15873f3d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,6 +52,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/ds.h> +#include <asm/debugreg.h> asmlinkage extern void ret_from_fork(void); @@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -341,6 +346,7 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + return err; } @@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); + return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4f76d275ee4..b25f8947ed7a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/workqueue.h> +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ 
-34,6 +36,7 @@ #include <asm/prctl.h> #include <asm/proto.h> #include <asm/ds.h> +#include <asm/hw_breakpoint.h> #include "tls.h" @@ -249,11 +252,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -378,15 +376,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -566,99 +555,229 @@ static int genregs_set(struct task_struct *target, return ret; } +static void ptrace_triggered(struct perf_event *bp, void *data) +{ + int i; + struct thread_struct *thread = &(current->thread); + + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) + break; + } + + thread->debugreg6 |= (DR_TRAP0 << i); +} + /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { - switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } } - return 0; + + return dr7; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { - int i; + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + int gen_len, gen_type; + struct perf_event *bp; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->ptrace_bps[i]; + + if (!enabled) { + if (bp) { + /* + * Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + thread->ptrace_bps[i] = NULL; + unregister_hw_breakpoint(bp); + } + continue; + } - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * We shoud have at least an inactive breakpoint at this + * slot. 
It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) { + rc = -EINVAL; + break; + } - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (rc) + break; - switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + /* + * This is a temporary thing as bp is unregistered/registered + * to simulate modification + */ + bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, + gen_type, bp->callback, + tsk, true); + thread->ptrace_bps[i] = NULL; - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; + if (!bp) { /* incorrect bp, or we have a bug in bp API */ + rc = -EINVAL; + break; + } + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + bp = NULL; + break; + } + thread->ptrace_bps[i] = bp; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} - case 7: +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->debugreg6; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + + if (!t->ptrace_bps[nr]) { /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 
24593 (AMD64 System Programming) + * Put stub len and type to register (reserve) an inactive but + * correct bp */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; + bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, + HW_BREAKPOINT_W, + ptrace_triggered, tsk, + false); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, + bp->attr.bp_type, + bp->callback, + tsk, + bp->attr.disabled); } + if (!bp) + return -EIO; + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; + return 0; } /* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + thread->debugreg6 = val; + goto ret_path; + } + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); + +ret_path: + return rc; +} + +/* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6a44a76055ad..fbf3b07c8567 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e37dcee0cc3..33399176512a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); /* Catch kmemcheck conditions first of all! */ - if (condition & DR_STEP && kmemcheck_trap(regs)) + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. 
*/ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae07d261527c..4fc80174191c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,6 +42,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#include <asm/debugreg.h> #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> @@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. 
+ */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..11a4ad4d6253 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) struct die_args *arg = args; if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) + if (post_kmmio_handler(arg->err, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; return NOTIFY_STOP; + } return NOTIFY_DONE; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 8aa85f17667e..0a979f3e5b8a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -18,6 +18,7 @@ #include <asm/mce.h> #include <asm/xcr.h> #include <asm/suspend.h> +#include <asm/debugreg.h> #ifdef CONFIG_X86_32 static struct saved_context saved_context; @@ -142,31 +143,6 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - if (current->thread.debugreg7) { -#ifdef CONFIG_X86_32 - set_debugreg(current->thread.debugreg0, 0); - set_debugreg(current->thread.debugreg1, 1); - set_debugreg(current->thread.debugreg2, 2); - set_debugreg(current->thread.debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); -#else - /* CONFIG_X86_64 */ - loaddebug(¤t->thread, 0); - loaddebug(¤t->thread, 1); - loaddebug(¤t->thread, 2); - loaddebug(¤t->thread, 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); -#endif - } - } /** diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index af75e07217ba..d8214dc03fa7 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -114,6 +114,7 @@ int main(int argc, char **argv) unsigned char insn_buf[16]; struct insn insn; int insns = 0, c; + int warnings = 0; parse_args(argc, argv); @@ -151,18 +152,22 @@ int main(int argc, char **argv) insn_init(&insn, insn_buf, x86_64); insn_get_length(&insn); if (insn.length != nb) { - fprintf(stderr, "Error: %s found a difference at %s\n", + warnings++; + fprintf(stderr, "Warning: %s found difference at %s\n", prog, sym); - fprintf(stderr, "Error: %s", line); - fprintf(stderr, "Error: objdump says %d bytes, but " + fprintf(stderr, "Warning: %s", line); + fprintf(stderr, "Warning: objdump says %d bytes, but " "insn_get_length() says %d\n", nb, insn.length); if (verbose) dump_insn(stderr, &insn); - exit(2); } } - fprintf(stderr, "Succeed: decoded and checked %d instructions\n", - insns); + if (warnings) + fprintf(stderr, "Warning: decoded and checked %d" + " instructions with %d warnings\n", insns, warnings); + else + fprintf(stderr, "Succeed: decoded and checked %d" + " instructions\n", insns); return 0; } diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 43360c1d8f70..47bbdf9c38d0 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -137,13 +137,8 @@ struct ftrace_event_call { #define FTRACE_MAX_PROFILE_SIZE 2048 -struct perf_trace_buf { - char buf[FTRACE_MAX_PROFILE_SIZE]; - int recursion; -}; - -extern struct perf_trace_buf *perf_trace_buf; -extern struct perf_trace_buf *perf_trace_buf_nmi; 
+extern char *perf_trace_buf; +extern char *perf_trace_buf_nmi; #define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h new file mode 100644 index 000000000000..c9f7f7c7b0e0 --- /dev/null +++ b/include/linux/hw_breakpoint.h @@ -0,0 +1,140 @@ +#ifndef _LINUX_HW_BREAKPOINT_H +#define _LINUX_HW_BREAKPOINT_H + +enum { + HW_BREAKPOINT_LEN_1 = 1, + HW_BREAKPOINT_LEN_2 = 2, + HW_BREAKPOINT_LEN_4 = 4, + HW_BREAKPOINT_LEN_8 = 8, +}; + +enum { + HW_BREAKPOINT_R = 1, + HW_BREAKPOINT_W = 2, + HW_BREAKPOINT_X = 4, +}; + +#ifdef __KERNEL__ + +#include <linux/perf_event.h> + +#ifdef CONFIG_HAVE_HW_BREAKPOINT + +static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) +{ + return bp->attr.bp_addr; +} + +static inline int hw_breakpoint_type(struct perf_event *bp) +{ + return bp->attr.bp_type; +} + +static inline int hw_breakpoint_len(struct perf_event *bp) +{ + return bp->attr.bp_len; +} + +extern struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active); + +/* FIXME: only change from the attr, and don't unregister */ +extern struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active); + +/* + * Kernel breakpoints are not associated with any particular thread. + */ +extern struct perf_event * +register_wide_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active); + +extern struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active); + +extern int register_perf_hw_breakpoint(struct perf_event *bp); +extern int __register_perf_hw_breakpoint(struct perf_event *bp); +extern void unregister_hw_breakpoint(struct perf_event *bp); +extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); + +extern int reserve_bp_slot(struct perf_event *bp); +extern void release_bp_slot(struct perf_event *bp); + +extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); + +static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) +{ + return &bp->hw.info; +} + +#else /* !CONFIG_HAVE_HW_BREAKPOINT */ + +static inline struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { return NULL; } +static inline struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { return NULL; } +static inline struct perf_event * +register_wide_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active) { return NULL; } +static inline struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active) { return NULL; } +static inline int +register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } +static inline int +__register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } +static inline void unregister_hw_breakpoint(struct perf_event *bp) { } +static inline void +unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { } +static 
inline int +reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } +static inline void release_bp_slot(struct perf_event *bp) { } + +static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { } + +static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) +{ + return NULL; +} + +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ +#endif /* __KERNEL__ */ + +#endif /* _LINUX_HW_BREAKPOINT_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7f87563c8485..43adbd7f0010 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -18,6 +18,10 @@ #include <linux/ioctl.h> #include <asm/byteorder.h> +#ifdef CONFIG_HAVE_HW_BREAKPOINT +#include <asm/hw_breakpoint.h> +#endif + /* * User-space ABI bits: */ @@ -31,6 +35,7 @@ enum perf_type_id { PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, PERF_TYPE_MAX, /* non-ABI */ }; @@ -209,6 +214,15 @@ struct perf_event_attr { __u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; + + union { + struct { /* Hardware breakpoint info */ + __u64 bp_addr; + __u32 bp_type; + __u32 bp_len; + }; + }; + __u32 __reserved_2; __u64 __reserved_3; @@ -478,6 +492,11 @@ struct hw_perf_event { s64 remaining; struct hrtimer hrtimer; }; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + union { /* breakpoint */ + struct arch_hw_breakpoint info; + }; +#endif }; atomic64_t prev_count; u64 sample_period; @@ -546,6 +565,10 @@ struct perf_pending_entry { void (*func)(struct perf_pending_entry *); }; +typedef void (*perf_callback_t)(struct perf_event *, void *); + +struct perf_sample_data; + /** * struct perf_event - performance event kernel representation: */ @@ -588,7 +611,7 @@ struct perf_event { u64 tstamp_running; u64 tstamp_stopped; - struct perf_event_attr attr; + struct perf_event_attr attr; struct hw_perf_event hw; struct perf_event_context *ctx; @@ -637,10 +660,18 @@ struct perf_event { struct pid_namespace *ns; u64 id; + void (*overflow_handler)(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs); + #ifdef CONFIG_EVENT_PROFILE struct event_filter *filter; #endif + perf_callback_t callback; + + perf_callback_t event_callback; + #endif /* CONFIG_PERF_EVENTS */ }; @@ -745,6 +776,14 @@ extern int hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, int cpu); extern void perf_event_update_userpage(struct perf_event *event); +extern int perf_event_release_kernel(struct perf_event *event); +extern struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, + int cpu, + pid_t pid, + perf_callback_t callback); +extern u64 perf_event_read_value(struct perf_event *event, + u64 *enabled, u64 *running); struct perf_sample_data { u64 type; @@ -821,6 +860,7 @@ extern int sysctl_perf_event_sample_rate; extern void perf_event_init(void); extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size); +extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags #define perf_misc_flags(regs) (user_mode(regs) ? 
PERF_RECORD_MISC_USER : \ @@ -834,6 +874,8 @@ extern int perf_output_begin(struct perf_output_handle *handle, extern void perf_output_end(struct perf_output_handle *handle); extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); +extern int perf_swevent_get_recursion_context(void); +extern void perf_swevent_put_recursion_context(int rctx); #else static inline void perf_event_task_sched_in(struct task_struct *task, int cpu) { } @@ -855,11 +897,15 @@ static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } +static inline void +perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } +static inline int perf_swevent_get_recursion_context(void) { return -1; } +static inline void perf_swevent_put_recursion_context(int rctx) { } #endif diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 4945d1c99864..c3417c13e3ed 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -724,17 +724,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + extern int perf_swevent_get_recursion_context(void); \ + extern void perf_swevent_put_recursion_context(int rctx); \ struct ftrace_event_call *event_call = &event_##call; \ extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ - struct perf_trace_buf *trace_buf; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ struct trace_entry *ent; \ int __entry_size; \ int __data_size; \ + char *trace_buf; \ char *raw_data; \ int __cpu; \ + int rctx; \ int pc; \ \ pc = preempt_count(); \ @@ -749,6 +752,11 @@ static void ftrace_profile_##call(proto) \ return; \ \ local_irq_save(irq_flags); \ + \ + rctx = perf_swevent_get_recursion_context(); \ + if (rctx < 0) \ + goto end_recursion; \ + \ __cpu = smp_processor_id(); \ \ if (in_nmi()) \ @@ -759,13 +767,7 @@ static void ftrace_profile_##call(proto) \ if (!trace_buf) \ goto end; \ \ - trace_buf = per_cpu_ptr(trace_buf, __cpu); \ - if (trace_buf->recursion++) \ - goto end_recursion; \ - \ - barrier(); \ - \ - raw_data = trace_buf->buf; \ + raw_data = per_cpu_ptr(trace_buf, __cpu); \ \ *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ @@ -780,9 +782,9 @@ static void ftrace_profile_##call(proto) \ perf_tp_event(event_call->id, __addr, __count, entry, \ __entry_size); \ \ -end_recursion: \ - trace_buf->recursion--; \ end: \ + perf_swevent_put_recursion_context(rctx); \ +end_recursion: \ local_irq_restore(irq_flags); \ \ } diff --git a/kernel/Makefile b/kernel/Makefile index b8d4cd8ac0b9..6b7ce8173dfd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -95,6 +96,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += 
hw_breakpoint.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index f7864ac2ecc1..3f45e3cf931d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); + /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..06d372fc026d --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,501 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. 
+ */ + +#include <linux/irqflags.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/hw_breakpoint.h> + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu) +{ + int i; + unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + + for (i = HBP_NUM -1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +{ + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned += max_task_bp_pinned(cpu); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr += max_task_bp_pinned(cpu); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + int count = 0; + struct perf_event *bp; + struct perf_event_context *ctx = tsk->perf_event_ctxp; + unsigned int *task_bp_pinned; + struct list_head *list; + unsigned long flags; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return; + + list = &ctx->event_list; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) + return; + + task_bp_pinned = per_cpu(task_bp_pinned, cpu); + if (enable) { + task_bp_pinned[count]++; + if (count > 0) + task_bp_pinned[count-1]--; + } else { + task_bp_pinned[count]--; + if (count > 0) + task_bp_pinned[count-1]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void toggle_bp_slot(struct perf_event *bp, bool enable) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + /* Pinned counter task profiling */ + if (tsk) { + if (cpu >= 0) { + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + for_each_online_cpu(cpu) + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + /* Pinned counter cpu profiling */ + if (enable) + per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + else + per_cpu(nr_cpu_bp_pinned, bp->cpu)--; +} + +/* + * 
Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + */ +int reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + int ret = 0; + + mutex_lock(&nr_bp_mutex); + + fetch_bp_busy_slots(&slots, bp->cpu); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) == HBP_NUM) { + ret = -ENOSPC; + goto end; + } + + toggle_bp_slot(bp, true); + +end: + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + toggle_bp_slot(bp, false); + + mutex_unlock(&nr_bp_mutex); +} + + +int __register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + /* + * Ptrace breakpoints can be temporary perf events only + * meant to reserve a slot. In this case, it is created disabled and + * we don't want to check the params right now (as we put a null addr) + * But perf tools create events as disabled and we want to check + * the params for them. + * This is a quick hack that will be removed soon, once we remove + * the tmp breakpoints from ptrace + */ + if (!bp->attr.disabled || bp->callback == perf_bp_event) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + + return ret; +} + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + bp->callback = perf_bp_event; + + return __register_perf_hw_breakpoint(bp); +} + +/* + * Register a breakpoint bound to a task and a given cpu. + * If cpu is -1, the breakpoint is active for the task in every cpu + * If the task is -1, the breakpoint is active for every tasks in the given + * cpu. 
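To make the admission rule above concrete, the check in reserve_bp_slot() boils down to the small predicate below. HBP_NUM is assumed to be 4 here (the four x86 debug registers); the helper and the numbers are purely illustrative, not part of the patch.

#include <linux/types.h>

/* The reserve_bp_slot() admission test, assuming HBP_NUM == 4. */
static bool bp_slots_exhausted(unsigned int pinned, unsigned int flexible)
{
	/* Flexible counters, as a group, must keep at least one register. */
	return pinned + (flexible ? 1 : 0) == 4 /* HBP_NUM */;
}

/*
 * Example: two cpu-pinned breakpoints plus a task whose deepest pinned
 * count is one give pinned == 3; with any flexible counter around,
 * 3 + 1 == HBP_NUM, so the next pinned request fails with -ENOSPC.
 */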
+ */ +static struct perf_event * +register_user_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + pid_t pid, + int cpu, + bool active) +{ + struct perf_event_attr *attr; + struct perf_event *bp; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return ERR_PTR(-ENOMEM); + + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(*attr); + attr->bp_addr = addr; + attr->bp_len = len; + attr->bp_type = type; + /* + * Such breakpoints are used by debuggers to trigger signals when + * we hit the excepted memory op. We can't miss such events, they + * must be pinned. + */ + attr->pinned = 1; + + if (!active) + attr->disabled = 1; + + bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered); + kfree(attr); + + return bp; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) + * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + * @active: should we activate it while registering it + * + */ +struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) +{ + return register_user_hw_breakpoint_cpu(addr, len, type, triggered, + tsk->pid, -1, active); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) + * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + * @active: should we activate it while registering it + */ +struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) +{ + /* + * FIXME: do it without unregistering + * - We don't want to lose our slot + * - If the new bp is incorrect, don't lose the older one + */ + unregister_hw_breakpoint(bp); + + return register_user_hw_breakpoint(addr, len, type, triggered, + tsk, active); +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +static struct perf_event * +register_kernel_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active) +{ + return register_user_hw_breakpoint_cpu(addr, len, type, triggered, + -1, cpu, active); +} + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) 
+ * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint + * @active: should we activate it while registering it + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active) +{ + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = register_kernel_hw_breakpoint_cpu(addr, len, type, + triggered, cpu, active); + + *pevent = bp; + + if (IS_ERR(bp) || !bp) { + err = PTR_ERR(bp); + goto fail; + } + } + + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent) || !*pevent) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} +core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8b6b8b697c68..8e5288a8a355 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3256e36ad251..accfd7bfe387 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -29,6 +29,7 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> @@ -245,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. 
+ */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -293,6 +337,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; + update_event_times(event); + event->state = PERF_EVENT_STATE_OFF; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -446,50 +493,11 @@ retry: * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&event->group_entry)) { + if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - } spin_unlock_irq(&ctx->lock); } -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - event->total_time_enabled = ctx->time - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = ctx->time; - - event->total_time_running = run_end - event->tstamp_running; -} - /* * Update total_time_enabled and total_time_running for all events in a group. 
*/ @@ -1032,10 +1040,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) + if (ctx->nr_active) { list_for_each_entry(event, &ctx->group_list, group_entry) group_sched_out(event, cpuctx, ctx); - + } perf_enable(); out: spin_unlock(&ctx->lock); @@ -1060,8 +1068,6 @@ static int context_equiv(struct perf_event_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_event_read(void *event); - static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { @@ -1079,8 +1085,8 @@ static void __perf_event_sync_stat(struct perf_event *event, */ switch (event->state) { case PERF_EVENT_STATE_ACTIVE: - __perf_event_read(event); - break; + event->pmu->read(event); + /* fall-through */ case PERF_EVENT_STATE_INACTIVE: update_event_times(event); @@ -1119,6 +1125,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, if (!ctx->nr_stat) return; + update_context_time(ctx); + event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); @@ -1162,8 +1170,6 @@ void perf_event_task_sched_out(struct task_struct *task, if (likely(!ctx || !cpuctx->task_ctx)) return; - update_context_time(ctx); - rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp; @@ -1516,7 +1522,6 @@ static void __perf_event_read(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -1528,12 +1533,12 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - event->pmu->read(event); + spin_lock(&ctx->lock); + update_context_time(ctx); update_event_times(event); - local_irq_restore(flags); + spin_unlock(&ctx->lock); + + event->pmu->read(event); } static u64 perf_event_read(struct perf_event *event) @@ -1546,7 +1551,13 @@ static u64 perf_event_read(struct perf_event *event) smp_call_function_single(event->oncpu, __perf_event_read, event, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); update_event_times(event); + spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1700,16 +1711,10 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) +int perf_event_release_kernel(struct perf_event *event) { - struct perf_event *event = file->private_data; struct perf_event_context *ctx = event->ctx; - file->private_data = NULL; - WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); @@ -1724,6 +1729,19 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. 
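Since this hunk factors the release path into an exported perf_event_release_kernel(), a hedged sketch of how an in-kernel user might pair it with perf_event_create_kernel_counter() and perf_event_read_value() (both added elsewhere in this patch) may help. The attribute values are only an example, and note that in this version the create helper returns NULL, not an ERR_PTR(), on failure.

#include <linux/perf_event.h>
#include <linux/kernel.h>

/* Illustrative only: counter lifetime around the exported helpers. */
static void kernel_counter_demo(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CONTEXT_SWITCHES,
		.size	= sizeof(attr),
		.pinned	= 1,
	};
	struct perf_event *event;
	u64 count, enabled, running;

	/* CPU 0, no task (pid == -1), no callback. */
	event = perf_event_create_kernel_counter(&attr, 0, -1, NULL);
	if (!event)		/* this version returns NULL on failure */
		return;

	/* ... let it count for a while ... */

	count = perf_event_read_value(event, &enabled, &running);
	pr_info("count=%llu enabled=%llu running=%llu\n",
		(unsigned long long)count,
		(unsigned long long)enabled,
		(unsigned long long)running);

	perf_event_release_kernel(event);
}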
+ */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + + file->private_data = NULL; + + return perf_event_release_kernel(event); +} static int perf_event_read_size(struct perf_event *event) { @@ -1750,91 +1768,94 @@ static int perf_event_read_size(struct perf_event *event) return size; } -static u64 perf_event_read_value(struct perf_event *event) +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); total += perf_event_read(event); - list_for_each_entry(child, &event->child_list, child_list) + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); return total; } - -static int perf_event_read_entry(struct perf_event *event, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} +EXPORT_SYMBOL_GPL(perf_event_read_value); static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_event_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; + goto unlock; - size += err; + ret = size; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_event_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; + n = 0; - size += err; + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); + + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } + + ret += size; } +unlock: + mutex_unlock(&ctx->mutex); - return size; + return ret; } static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { + u64 enabled, running; u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event); - if (read_format & 
PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + - atomic64_read(&event->child_total_time_running); - } + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); @@ -1865,12 +1886,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) ret = perf_event_read_group(event, read_format, buf); else ret = perf_event_read_one(event, read_format, buf); - mutex_unlock(&event->child_mutex); return ret; } @@ -2315,7 +2334,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) } if (!data->watermark) - data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + data->watermark = max_size / 2; rcu_assign_pointer(event->data, data); @@ -3245,15 +3264,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_task_match(event)) perf_event_task_output(event, task_event); } - rcu_read_unlock(); } static void perf_event_task_event(struct perf_task_event *task_event) @@ -3261,11 +3275,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) @@ -3357,15 +3371,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_comm_match(event)) perf_event_comm_output(event, comm_event); } - rcu_read_unlock(); } static void perf_event_comm_event(struct perf_comm_event *comm_event) @@ -3376,7 +3385,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3384,11 +3393,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. 
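One small change in the comm hunk above is the switch from strncpy() to strlcpy(). As a reminder of the difference (illustrative snippet, not from the patch): strlcpy() always NUL-terminates the destination and returns the full source length, while strncpy() leaves the buffer unterminated when the source does not fit.

#include <linux/string.h>

/* Illustrative only. */
static void copy_comm_demo(const char *src)
{
	char comm[16];

	/* strncpy() leaves comm unterminated if src fills all 16 bytes. */
	strncpy(comm, src, sizeof(comm));

	/* strlcpy() always NUL-terminates and returns strlen(src). */
	strlcpy(comm, src, sizeof(comm));
}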
@@ -3481,15 +3490,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_mmap_match(event, mmap_event)) perf_event_mmap_output(event, mmap_event); } - rcu_read_unlock(); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -3545,11 +3549,11 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3688,7 +3692,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, perf_event_disable(event); } - perf_event_output(event, nmi, data, regs); + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + return ret; } @@ -3733,16 +3741,16 @@ again: return nr; } -static void perf_swevent_overflow(struct perf_event *event, +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; - u64 overflow; data->period = event->hw.last_period; - overflow = perf_swevent_set_period(event); + if (!overflow) + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; @@ -3775,14 +3783,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, atomic64_add(nr, &event->count); + if (!regs) + return; + if (!hwc->sample_period) return; - if (!regs) + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (atomic64_add_negative(nr, &hwc->period_left)) return; - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swevent_overflow(event, nmi, data, regs); + perf_swevent_overflow(event, 0, nmi, data, regs); } static int perf_swevent_is_counting(struct perf_event *event) @@ -3818,6 +3831,20 @@ static int perf_swevent_is_counting(struct perf_event *event) static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, u32 event_id, @@ -3829,16 +3856,12 @@ static int perf_swevent_match(struct perf_event *event, if (event->attr.type != type) return 0; + if (event->attr.config != event_id) return 0; - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 0; - - if (event->attr.exclude_kernel && !user_mode(regs)) - return 0; - } + if (perf_exclude_event(event, regs)) + return 0; if (event->attr.type == PERF_TYPE_TRACEPOINT && !perf_tp_event_match(event, data)) @@ -3855,49 +3878,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) 
perf_swevent_add(event, nr, nmi, data, regs); } - rcu_read_unlock(); } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +int perf_swevent_get_recursion_context(void) { + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int rctx; + if (in_nmi()) - return &cpuctx->recursion[3]; + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (cpuctx->recursion[rctx]) { + put_cpu_var(perf_cpu_context); + return -1; + } - if (in_irq()) - return &cpuctx->recursion[2]; + cpuctx->recursion[rctx]++; + barrier(); - if (in_softirq()) - return &cpuctx->recursion[1]; + return rctx; +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - return &cpuctx->recursion[0]; +void perf_swevent_put_recursion_context(int rctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + barrier(); + cpuctx->recursion[rctx]++; + put_cpu_var(perf_cpu_context); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - + cpuctx = &__get_cpu_var(perf_cpu_context); + rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3906,23 +3939,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - struct perf_sample_data data = { - .addr = addr, - }; + struct perf_sample_data data; + int rctx; - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, - &data, regs); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + data.addr = addr; + data.raw = NULL; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); } static void perf_swevent_read(struct perf_event *event) @@ -4145,6 +4179,7 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); + /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } @@ -4231,6 +4266,57 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_PROFILE */ +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + /* + * The breakpoint is already filled if we haven't created the counter + * through perf syscall + * FIXME: manage to get trigerred to NULL if it comes from syscalls + */ + if (!bp->callback) + err = register_perf_hw_breakpoint(bp); + else + err = __register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct 
pt_regs *regs = data; + + sample.addr = bp->attr.bp_addr; + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); +} +#else +static void bp_perf_event_destroy(struct perf_event *event) +{ +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} +#endif + atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) @@ -4297,6 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, + perf_callback_t callback, gfp_t gfpflags) { const struct pmu *pmu; @@ -4339,6 +4426,11 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (!callback && parent_event) + callback = parent_event->callback; + + event->callback = callback; + if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4373,6 +4465,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4615,7 +4712,7 @@ SYSCALL_DEFINE5(perf_event_open, } event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); + NULL, NULL, GFP_KERNEL); err = PTR_ERR(event); if (IS_ERR(event)) goto err_put_context; @@ -4663,6 +4760,58 @@ err_put_context: return err; } +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @pid: task to profile + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + pid_t pid, perf_callback_t callback) +{ + struct perf_event *event; + struct perf_event_context *ctx; + int err; + + /* + * Get the target context (task or percpu): + */ + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return NULL; + + event = perf_event_alloc(attr, cpu, ctx, NULL, + NULL, callback, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + return event; + +err_put_context: + if (err < 0) + put_ctx(ctx); + + return NULL; +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + /* * inherit a event from parent task to child task: */ @@ -4688,7 +4837,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child_ctx, group_leader, parent_event, - GFP_KERNEL); + NULL, GFP_KERNEL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -4706,6 +4855,8 @@ inherit_event(struct perf_event *parent_event, if (parent_event->attr.freq) child_event->hw.sample_period = parent_event->hw.sample_period; + child_event->overflow_handler = parent_event->overflow_handler; + /* * Link it up in the child's context: */ @@ -4795,7 +4946,6 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - update_event_times(child_event); perf_event_remove_from_context(child_event); parent_event = child_event->parent; @@ -4847,6 +4997,7 @@ void perf_event_exit_task(struct 
task_struct *child) * the events from it. */ unclone_ctx(child_ctx); + update_context_time(child_ctx); spin_unlock_irqrestore(&child_ctx->lock, flags); /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f05671609a89..d006554888dc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -339,6 +339,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. config STACK_TRACER bool "Trace max stack" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index edc3a3cca1a1..cd9ecd89ec77 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b4e4212e66d7..4da6ede74401 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -37,6 +38,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -232,6 +234,7 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -387,6 +390,8 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); #ifdef CONFIG_TRACER_MAX_TRACE @@ -461,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ead3d724599d..c16a08f399df 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, F_printk("type:%u call_site:%lx ptr:%p", __entry->type_id, __entry->call_site, __entry->ptr) ); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , cmd, TASK_COMM_LEN ) + 
__field( unsigned long, addr ) + ), + + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + (void *)__entry->addr, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index e0d351b01f5a..d9c60f80aa0d 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -9,31 +9,33 @@ #include "trace.h" -struct perf_trace_buf *perf_trace_buf; +char *perf_trace_buf; EXPORT_SYMBOL_GPL(perf_trace_buf); -struct perf_trace_buf *perf_trace_buf_nmi; +char *perf_trace_buf_nmi; EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; + /* Count the events in use (per event id, not per instance) */ static int total_profile_count; static int ftrace_profile_enable_event(struct ftrace_event_call *event) { - struct perf_trace_buf *buf; + char *buf; int ret = -ENOMEM; if (atomic_inc_return(&event->profile_count)) return 0; if (!total_profile_count) { - buf = alloc_percpu(struct perf_trace_buf); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; rcu_assign_pointer(perf_trace_buf, buf); - buf = alloc_percpu(struct perf_trace_buf); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; @@ -79,7 +81,7 @@ int ftrace_profile_enable(int event_id) static void ftrace_profile_disable_event(struct ftrace_event_call *event) { - struct perf_trace_buf *buf, *nmi_buf; + char *buf, *nmi_buf; if (!atomic_add_negative(-1, &event->profile_count)) return; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3696476f307d..79ce6a2bd74f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1208,11 +1208,12 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; - struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; + char *trace_buf; char *raw_data; + int rctx; pc = preempt_count(); __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); @@ -1227,6 +1228,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, * This also protects the rcu read side */ local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + __cpu = smp_processor_id(); if (in_nmi()) @@ -1237,18 +1243,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, __cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. 
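The kprobe profile path above (and the tracepoint and syscall paths elsewhere in this patch) now bracket their per-cpu buffer use with the recursion-context API introduced earlier in the series. A bare skeleton of that pattern follows, with a hypothetical emit_sample() standing in for the real buffer handling and perf_tp_event() call.

#include <linux/irqflags.h>
#include <linux/perf_event.h>

/* Hypothetical stand-in for the real "fill buffer + perf_tp_event()" work. */
static void emit_sample(void)
{
}

/*
 * Illustrative skeleton of the bracketing used by the profile handlers:
 * rctx identifies the context level (0 task, 1 softirq, 2 irq, 3 NMI)
 * and is -1 when that level is already inside a software event.
 */
static void profile_hook_skeleton(void)
{
	unsigned long flags;
	int rctx;

	local_irq_save(flags);

	rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		goto out;

	emit_sample();

	perf_swevent_put_recursion_context(rctx);
out:
	local_irq_restore(flags);
}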
- */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, __cpu); /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -1263,9 +1258,9 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ip, 1, entry, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(irq_flags); return 0; @@ -1278,11 +1273,12 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; - struct perf_trace_buf *trace_buf; struct trace_entry *ent; int size, __size, i, pc, __cpu; unsigned long irq_flags; + char *trace_buf; char *raw_data; + int rctx; pc = preempt_count(); __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); @@ -1297,6 +1293,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, * This also protects the rcu read side */ local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + __cpu = smp_processor_id(); if (in_nmi()) @@ -1307,18 +1308,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, __cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. - */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, __cpu); /* Zero dead bytes from alignment to avoid buffer leak to userspace */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -1334,9 +1324,9 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); perf_tp_event(call->id, entry->ret_ip, 1, entry, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(irq_flags); return 0; diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 000000000000..11935b53a6cb --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,554 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/fs.h> + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +#include <linux/hw_breakpoint.h> +#include <asm/hw_breakpoint.h> + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. + */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ + +struct trace_ksym { + struct perf_event **ksym_hbp; + unsigned long ksym_addr; + int type; + int len; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +static DEFINE_MUTEX(ksym_tracer_mutex); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->ksym_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct perf_event *hbp, void *data) +{ + struct ring_buffer_event *event; + struct ksym_trace_entry *entry; + struct pt_regs *regs = data; + struct ring_buffer *buffer; + int pc; + + if (!ksym_tracing_enabled) + return; + + buffer = ksym_trace_array->buffer; + + pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hw_breakpoint_addr(hbp)); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *str) +{ + int access = 0; + + if (str[0] == 'r') + access |= HW_BREAKPOINT_R; + + if (str[1] == 'w') + access |= HW_BREAKPOINT_W; + + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; + + switch (access) { + case HW_BREAKPOINT_R: + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; + } +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * <module>:<ksym_name>:<op>. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. 
Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + int ret; + + *ksymname = strsep(&input_string, ":"); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + + ret = ksym_trace_get_access_type(input_string); + + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret = -ENOMEM; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->type = op; + entry->ksym_addr = addr; + entry->len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr, + entry->len, entry->type, + ksym_hbp_handler, true); + if (IS_ERR(entry->ksym_hbp)) { + entry->ksym_hbp = NULL; + ret = PTR_ERR(entry->ksym_hbp); + } + + if (!entry->ksym_hbp) { + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + goto err; + } + + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; + +err: + kfree(entry); + + return ret; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr); + if (entry->type == HW_BREAKPOINT_R) + ret = trace_seq_puts(s, "r--\n"); + else if (entry->type == HW_BREAKPOINT_W) + ret = trace_seq_puts(s, "-w-\n"); + else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); + } + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + mutex_unlock(&ksym_tracer_mutex); + + kfree(s); + + return cnt; +} + +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + input_string = kzalloc(count + 1, GFP_KERNEL); + if (!input_string) + return -ENOMEM; + + if (copy_from_user(input_string, buffer, count)) { + kfree(input_string); + return -EFAULT; + } + input_string[count] = '\0'; + + strstrip(input_string); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!input_string[0] || !strcmp(input_string, "0") || + 
!strcmp(input_string, "*:---")) { + __ksym_trace_reset(); + kfree(input_string); + return count; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) { + kfree(input_string); + return ret; + } + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->ksym_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->type != op) + changed = 1; + else + goto out; + break; + } + } + if (changed) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->type = op; + if (op > 0) { + entry->ksym_hbp = + register_wide_hw_breakpoint(entry->ksym_addr, + entry->len, entry->type, + ksym_hbp_handler, true); + if (IS_ERR(entry->ksym_hbp)) + entry->ksym_hbp = NULL; + if (!entry->ksym_hbp) + goto out; + } + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + ret = 0; + goto out; + } else { + /* Check for malformed request: (4) */ + if (op == 0) + goto out; + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + } +out: + mutex_unlock(&ksym_tracer_mutex); + + kfree(input_string); + + if (!ret) + ret = count; + return ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + ksym_tracing_enabled = 0; + __ksym_trace_reset(); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# TASK-PID CPU# Symbol " + "Type Function\n"); + seq_puts(m, + "# | | | " + " | |\n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct ksym_trace_entry *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->type) { + case HW_BREAKPOINT_R: + ret = trace_seq_printf(s, " R "); + break; + case HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return register_tracer(&ksym_tracer); 
+} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + access_type = entry->type; + + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } + + if (lookup_symbol_name(entry->ksym_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15lu\n", entry->counter); + + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return ksym_filter_head.first; +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d2cdbabb4ead..dc98309e839a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + + ret = process_new_ksym_entry("ksym_selftest_dummy", + HW_BREAKPOINT_R | HW_BREAKPOINT_W, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. 
*/ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 51213b0aa81b..9189cbe86079 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -477,11 +477,12 @@ static int sys_prof_refcount_exit; static void prof_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; - struct perf_trace_buf *trace_buf; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; int syscall_nr; + int rctx; int size; int cpu; @@ -505,28 +506,18 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference(perf_trace_buf); if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. - */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -539,9 +530,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_tp_event(sys_data->enter_id, 0, 1, rec, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } @@ -588,10 +579,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct perf_trace_buf *trace_buf; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int rctx; int size; int cpu; @@ -617,28 +609,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference(perf_trace_buf); if (!trace_buf) goto end; - trace_buf = per_cpu_ptr(trace_buf, cpu); - - if (trace_buf->recursion++) - goto end_recursion; - - /* - * Make recursion update visible before entering perf_tp_event - * so that we protect from perf recursions. 
- */ - barrier(); - - raw_data = trace_buf->buf; + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -652,9 +635,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) perf_tp_event(sys_data->exit_id, 0, 1, rec, size); -end_recursion: - trace_buf->recursion--; end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } diff --git a/samples/Kconfig b/samples/Kconfig index b92bde3c6a89..e4be84ac3d38 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -40,5 +40,11 @@ config SAMPLE_KRETPROBES default m depends on SAMPLE_KPROBES && KRETPROBES +config SAMPLE_HW_BREAKPOINT + tristate "Build kernel hardware breakpoint examples -- loadable module only" + depends on HAVE_HW_BREAKPOINT && m + help + This builds kernel hardware breakpoint example modules. + endif # SAMPLES diff --git a/samples/Makefile b/samples/Makefile index 43343a03b1f4..0f15e6d77fd6 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,4 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ +obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \ + hw_breakpoint/ diff --git a/samples/hw_breakpoint/Makefile b/samples/hw_breakpoint/Makefile new file mode 100644 index 000000000000..0f5c31c2fc47 --- /dev/null +++ b/samples/hw_breakpoint/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SAMPLE_HW_BREAKPOINT) += data_breakpoint.o diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c new file mode 100644 index 000000000000..95063818bcf4 --- /dev/null +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -0,0 +1,90 @@ +/* + * data_breakpoint.c - Sample HW Breakpoint file to watch kernel data address + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * usage: insmod data_breakpoint.ko ksym=<ksym_name> + * + * This file is a kernel module that places a breakpoint over ksym_name kernel + * variable using Hardware Breakpoint register. The corresponding handler which + * prints a backtrace is invoked everytime a write operation is performed on + * that variable. 
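To make the usage line above concrete: with the sample built as a module (CONFIG_SAMPLE_HW_BREAKPOINT=m) and left at its default symbol, any write to pid_max, for example via sysctl, should fire the handler and leave a backtrace in the kernel log. This is an illustrative sketch, not part of the patch; the .ko path and the sysctl value are assumptions:

    insmod samples/hw_breakpoint/data_breakpoint.ko ksym=pid_max
    sysctl -w kernel.pid_max=32768   # any write to pid_max triggers the breakpoint handler
    dmesg | tail                     # expect "pid_max value is changed" plus a dump_stack() trace
    rmmod data_breakpoint            # hw_break_module_exit() unregisters the wide breakpoint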
+ * + * Copyright (C) IBM Corporation, 2009 + * + * Author: K.Prasad <prasad@linux.vnet.ibm.com> + */ +#include <linux/module.h> /* Needed by all modules */ +#include <linux/kernel.h> /* Needed for KERN_INFO */ +#include <linux/init.h> /* Needed for the macros */ +#include <linux/kallsyms.h> + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> + +struct perf_event **sample_hbp; + +static char ksym_name[KSYM_NAME_LEN] = "pid_max"; +module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); +MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" + " write operations on the kernel symbol"); + +static void sample_hbp_handler(struct perf_event *temp, void *data) +{ + printk(KERN_INFO "%s value is changed\n", ksym_name); + dump_stack(); + printk(KERN_INFO "Dump stack from sample_hbp_handler\n"); +} + +static int __init hw_break_module_init(void) +{ + int ret; + unsigned long addr; + + addr = kallsyms_lookup_name(ksym_name); + + sample_hbp = register_wide_hw_breakpoint(addr, HW_BREAKPOINT_LEN_4, + HW_BREAKPOINT_W | HW_BREAKPOINT_R, + sample_hbp_handler, true); + if (IS_ERR(sample_hbp)) { + ret = PTR_ERR(sample_hbp); + goto fail; + } else if (!sample_hbp) { + ret = -EINVAL; + goto fail; + } + + printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name); + + return 0; + +fail: + printk(KERN_INFO "Breakpoint registration failed\n"); + + return ret; +} + +static void __exit hw_break_module_exit(void) +{ + unregister_wide_hw_breakpoint(sample_hbp); + printk(KERN_INFO "HW Breakpoint for %s write uninstalled\n", ksym_name); +} + +module_init(hw_break_module_init); +module_exit(hw_break_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("K.Prasad"); +MODULE_DESCRIPTION("ksym breakpoint"); diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt new file mode 100644 index 000000000000..44b0ce35c28a --- /dev/null +++ b/tools/perf/Documentation/perf-kmem.txt @@ -0,0 +1,44 @@ +perf-kmem(1) +============== + +NAME +---- +perf-kmem - Tool to trace/measure kernel memory(slab) properties + +SYNOPSIS +-------- +[verse] +'perf kmem' {record} [<options>] + +DESCRIPTION +----------- +There's two variants of perf kmem: + + 'perf kmem record <command>' to record the kmem events + of an arbitrary workload. + + 'perf kmem' to report kernel memory statistics. + +OPTIONS +------- +-i <file>:: +--input=<file>:: + Select the input file (default: perf.data) + +--stat=<caller|alloc>:: + Select per callsite or per allocation statistics + +-s <key[,key2...]>:: +--sort=<key[,key2...]>:: + Sort the output (default: frag,hit,bytes) + +-l <num>:: +--line=<num>:: + Print n lines only + +--raw-ip:: + Print raw ip instead of symbol + +SEE ALSO +-------- +linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 0ff23de9e453..fc46c0b40f6e 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -26,11 +26,19 @@ OPTIONS -e:: --event=:: - Select the PMU event. Selection can be a symbolic event name - (use 'perf list' to list all events) or a raw PMU - event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. + Select the PMU event. Selection can be: + - a symbolic event name (use 'perf list' to list all events) + + - a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a + hexadecimal event descriptor. 
+ + - a hardware breakpoint event in the form of '\mem:addr[:access]' + where addr is the address in memory you want to break in. + Access is the memory access type (read, write, execute) it can + be passed as follows: '\mem:addr[:[r][w][x]]'. + If you want to profile read-write accesses in 0x1000, just set + 'mem:0x1000:rw'. -a:: System-wide collection. diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 53e663a5fa2f..f1537a94a05f 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -2,6 +2,7 @@ all:: # Define V=1 to have a more verbose compile. +# Define V=2 to have an even more verbose compile. # # Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf() # or vsnprintf() return -1 instead of number of characters which would @@ -147,6 +148,8 @@ all:: # broken, or spawning external process is slower than built-in grep perf has). # # Define LDFLAGS=-static to build a static binary. +# +# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds. PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE @$(SHELL_PATH) util/PERF-VERSION-GEN @@ -159,22 +162,6 @@ uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not') uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not') uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') -# -# Add -m32 for cross-builds: -# -ifdef NO_64BIT - MBITS := -m32 -else - # - # If we're on a 64-bit kernel (except ia64), use -m64: - # - ifneq ($(uname_M),ia64) - ifneq ($(patsubst %64,%,$(uname_M)),$(uname_M)) - MBITS := -m64 - endif - endif -endif - # CFLAGS and LDFLAGS are for the users to override from the command line. # @@ -211,7 +198,7 @@ ifndef PERF_DEBUG CFLAGS_OPTIMIZE = -O6 endif -CFLAGS = $(MBITS) -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) +CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) EXTLIBS = -lpthread -lrt -lelf -lm ALL_CFLAGS = $(CFLAGS) ALL_LDFLAGS = $(LDFLAGS) @@ -263,7 +250,7 @@ PTHREAD_LIBS = -lpthread # explicitly what architecture to check for. Fix this up for yours.. 
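The user-visible pieces documented above -- the new 'perf kmem' subcommand, the mem:addr[:access] breakpoint events, and the V=2/EXTRA_CFLAGS Makefile knobs -- can be summarized with a few example invocations. These are illustrative sketches assembled from the documentation text, not commands or output taken from the patch itself:

    # more verbose build, optionally forced to 32-bit now that the MBITS autodetection is gone
    make -C tools/perf V=2 EXTRA_CFLAGS=-m32

    # record kmem tracepoints for a workload, then report per-callsite statistics
    perf kmem record sleep 10
    perf kmem --stat=caller --sort=frag,hit,bytes -l 20

    # system-wide sampling of read/write accesses to an address via a hardware breakpoint event
    perf record -e mem:0x1000:rw -a sleep 10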
SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ -ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null >/dev/null 2>&1 && echo y"), y) +ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o /dev/null "$(QUIET_STDERR)" && echo y"), y) CFLAGS := $(CFLAGS) -fstack-protector-all endif @@ -445,9 +432,15 @@ BUILTIN_OBJS += builtin-timechart.o BUILTIN_OBJS += builtin-top.o BUILTIN_OBJS += builtin-trace.o BUILTIN_OBJS += builtin-probe.o +BUILTIN_OBJS += builtin-kmem.o PERFLIBS = $(LIB_FILE) +ifeq ($(V), 2) + QUIET_STDERR = ">/dev/null" +else + QUIET_STDERR = ">/dev/null 2>&1" +endif # # Platform specific tweaks # @@ -475,19 +468,19 @@ ifeq ($(uname_S),Darwin) PTHREAD_LIBS = endif -ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) -ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) - msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]); +ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) +ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) + msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static); endif - ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) + ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) BASIC_CFLAGS += -DLIBELF_NO_MMAP endif else msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]); endif -ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) > /dev/null 2>&1 && echo y"), y) +ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, 
DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y) msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231); BASIC_CFLAGS += -DNO_LIBDWARF else @@ -499,25 +492,25 @@ endif ifdef NO_DEMANGLE BASIC_CFLAGS += -DNO_DEMANGLE else - has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd > /dev/null 2>&1 && echo y") + has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y") ifeq ($(has_bfd),y) EXTLIBS += -lbfd else - has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty > /dev/null 2>&1 && echo y") + has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty "$(QUIET_STDERR)" && echo y") ifeq ($(has_bfd_iberty),y) EXTLIBS += -lbfd -liberty else - has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") + has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz "$(QUIET_STDERR)" && echo y") ifeq ($(has_bfd_iberty_z),y) EXTLIBS += -lbfd -liberty -lz else - has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty > /dev/null 2>&1 && echo y") + has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) $(EXTLIBS) -liberty "$(QUIET_STDERR)" && echo y") ifeq ($(has_cplus_demangle),y) EXTLIBS += -liberty BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE else - msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) + msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling) BASIC_CFLAGS += -DNO_DEMANGLE endif endif diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 77d50a6d6802..6b13a1ecf1e7 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -33,9 +33,11 @@ static int input; static int full_paths; static int print_line; +static bool use_modules; static unsigned long page_size; static unsigned long mmap_window = 32; +const char *vmlinux_name; struct sym_hist { u64 sum; @@ -156,7 +158,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (event->header.misc & PERF_RECORD_MISC_KERNEL) { level = 'k'; - sym = kernel_maps__find_symbol(ip, 
&map); + sym = kernel_maps__find_symbol(ip, &map, symbol_filter); dump_printf(" ...... dso: %s\n", map ? map->dso->long_name : "<not found>"); } else if (event->header.misc & PERF_RECORD_MISC_USER) { @@ -636,9 +638,9 @@ static int __cmd_annotate(void) exit(0); } - if (load_kernel(symbol_filter) < 0) { - perror("failed to load kernel symbols"); - return EXIT_FAILURE; + if (kernel_maps__init(vmlinux_name, true, use_modules) < 0) { + pr_err("failed to create kernel maps for symbol resolution\b"); + return -1; } remap: @@ -742,7 +744,7 @@ static const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"), - OPT_BOOLEAN('m', "modules", &modules, + OPT_BOOLEAN('m', "modules", &use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), OPT_BOOLEAN('l', "print-line", &print_line, "print matching source lines (may be slow)"), diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c new file mode 100644 index 000000000000..173d6db42ecb --- /dev/null +++ b/tools/perf/builtin-kmem.c @@ -0,0 +1,833 @@ +#include "builtin.h" +#include "perf.h" + +#include "util/util.h" +#include "util/cache.h" +#include "util/symbol.h" +#include "util/thread.h" +#include "util/header.h" + +#include "util/parse-options.h" +#include "util/trace-event.h" + +#include "util/debug.h" +#include "util/data_map.h" + +#include <linux/rbtree.h> + +struct alloc_stat; +typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *); + +static char const *input_name = "perf.data"; + +static struct perf_header *header; +static u64 sample_type; + +static int alloc_flag; +static int caller_flag; + +static int alloc_lines = -1; +static int caller_lines = -1; + +static bool raw_ip; + +static char default_sort_order[] = "frag,hit,bytes"; + +static char *cwd; +static int cwdlen; + +static int *cpunode_map; +static int max_cpu_num; + +struct alloc_stat { + u64 call_site; + u64 ptr; + u64 bytes_req; + u64 bytes_alloc; + u32 hit; + u32 pingpong; + + short alloc_cpu; + + struct rb_node node; +}; + +static struct rb_root root_alloc_stat; +static struct rb_root root_alloc_sorted; +static struct rb_root root_caller_stat; +static struct rb_root root_caller_sorted; + +static unsigned long total_requested, total_allocated; +static unsigned long nr_allocs, nr_cross_allocs; + +struct raw_event_sample { + u32 size; + char data[0]; +}; + +#define PATH_SYS_NODE "/sys/devices/system/node" + +static void init_cpunode_map(void) +{ + FILE *fp; + int i; + + fp = fopen("/sys/devices/system/cpu/kernel_max", "r"); + if (!fp) { + max_cpu_num = 4096; + return; + } + + if (fscanf(fp, "%d", &max_cpu_num) < 1) + die("Failed to read 'kernel_max' from sysfs"); + max_cpu_num++; + + cpunode_map = calloc(max_cpu_num, sizeof(int)); + if (!cpunode_map) + die("calloc"); + for (i = 0; i < max_cpu_num; i++) + cpunode_map[i] = -1; + fclose(fp); +} + +static void setup_cpunode_map(void) +{ + struct dirent *dent1, *dent2; + DIR *dir1, *dir2; + unsigned int cpu, mem; + char buf[PATH_MAX]; + + init_cpunode_map(); + + dir1 = opendir(PATH_SYS_NODE); + if (!dir1) + return; + + while (true) { + dent1 = readdir(dir1); + if (!dent1) + break; + + if (sscanf(dent1->d_name, "node%u", &mem) < 1) + continue; + + snprintf(buf, PATH_MAX, "%s/%s", PATH_SYS_NODE, dent1->d_name); + dir2 = opendir(buf); + if (!dir2) + continue; + while (true) { + dent2 = readdir(dir2); + if (!dent2) + break; + if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1) + 
continue; + cpunode_map[cpu] = mem; + } + } +} + +static int +process_comm_event(event_t *event, unsigned long offset, unsigned long head) +{ + struct thread *thread = threads__findnew(event->comm.pid); + + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", + (void *)(offset + head), + (void *)(long)(event->header.size), + event->comm.comm, event->comm.pid); + + if (thread == NULL || + thread__set_comm(thread, event->comm.comm)) { + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); + return -1; + } + + return 0; +} + +static void insert_alloc_stat(unsigned long call_site, unsigned long ptr, + int bytes_req, int bytes_alloc, int cpu) +{ + struct rb_node **node = &root_alloc_stat.rb_node; + struct rb_node *parent = NULL; + struct alloc_stat *data = NULL; + + while (*node) { + parent = *node; + data = rb_entry(*node, struct alloc_stat, node); + + if (ptr > data->ptr) + node = &(*node)->rb_right; + else if (ptr < data->ptr) + node = &(*node)->rb_left; + else + break; + } + + if (data && data->ptr == ptr) { + data->hit++; + data->bytes_req += bytes_req; + data->bytes_alloc += bytes_req; + } else { + data = malloc(sizeof(*data)); + if (!data) + die("malloc"); + data->ptr = ptr; + data->pingpong = 0; + data->hit = 1; + data->bytes_req = bytes_req; + data->bytes_alloc = bytes_alloc; + + rb_link_node(&data->node, parent, node); + rb_insert_color(&data->node, &root_alloc_stat); + } + data->call_site = call_site; + data->alloc_cpu = cpu; +} + +static void insert_caller_stat(unsigned long call_site, + int bytes_req, int bytes_alloc) +{ + struct rb_node **node = &root_caller_stat.rb_node; + struct rb_node *parent = NULL; + struct alloc_stat *data = NULL; + + while (*node) { + parent = *node; + data = rb_entry(*node, struct alloc_stat, node); + + if (call_site > data->call_site) + node = &(*node)->rb_right; + else if (call_site < data->call_site) + node = &(*node)->rb_left; + else + break; + } + + if (data && data->call_site == call_site) { + data->hit++; + data->bytes_req += bytes_req; + data->bytes_alloc += bytes_req; + } else { + data = malloc(sizeof(*data)); + if (!data) + die("malloc"); + data->call_site = call_site; + data->pingpong = 0; + data->hit = 1; + data->bytes_req = bytes_req; + data->bytes_alloc = bytes_alloc; + + rb_link_node(&data->node, parent, node); + rb_insert_color(&data->node, &root_caller_stat); + } +} + +static void process_alloc_event(struct raw_event_sample *raw, + struct event *event, + int cpu, + u64 timestamp __used, + struct thread *thread __used, + int node) +{ + unsigned long call_site; + unsigned long ptr; + int bytes_req; + int bytes_alloc; + int node1, node2; + + ptr = raw_field_value(event, "ptr", raw->data); + call_site = raw_field_value(event, "call_site", raw->data); + bytes_req = raw_field_value(event, "bytes_req", raw->data); + bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data); + + insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu); + insert_caller_stat(call_site, bytes_req, bytes_alloc); + + total_requested += bytes_req; + total_allocated += bytes_alloc; + + if (node) { + node1 = cpunode_map[cpu]; + node2 = raw_field_value(event, "node", raw->data); + if (node1 != node2) + nr_cross_allocs++; + } + nr_allocs++; +} + +static int ptr_cmp(struct alloc_stat *, struct alloc_stat *); +static int callsite_cmp(struct alloc_stat *, struct alloc_stat *); + +static struct alloc_stat *search_alloc_stat(unsigned long ptr, + unsigned long call_site, + struct rb_root *root, + sort_fn_t sort_fn) +{ + struct rb_node *node = 
root->rb_node; + struct alloc_stat key = { .ptr = ptr, .call_site = call_site }; + + while (node) { + struct alloc_stat *data; + int cmp; + + data = rb_entry(node, struct alloc_stat, node); + + cmp = sort_fn(&key, data); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void process_free_event(struct raw_event_sample *raw, + struct event *event, + int cpu, + u64 timestamp __used, + struct thread *thread __used) +{ + unsigned long ptr; + struct alloc_stat *s_alloc, *s_caller; + + ptr = raw_field_value(event, "ptr", raw->data); + + s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp); + if (!s_alloc) + return; + + if (cpu != s_alloc->alloc_cpu) { + s_alloc->pingpong++; + + s_caller = search_alloc_stat(0, s_alloc->call_site, + &root_caller_stat, callsite_cmp); + assert(s_caller); + s_caller->pingpong++; + } + s_alloc->alloc_cpu = -1; +} + +static void +process_raw_event(event_t *raw_event __used, void *more_data, + int cpu, u64 timestamp, struct thread *thread) +{ + struct raw_event_sample *raw = more_data; + struct event *event; + int type; + + type = trace_parse_common_type(raw->data); + event = trace_find_event(type); + + if (!strcmp(event->name, "kmalloc") || + !strcmp(event->name, "kmem_cache_alloc")) { + process_alloc_event(raw, event, cpu, timestamp, thread, 0); + return; + } + + if (!strcmp(event->name, "kmalloc_node") || + !strcmp(event->name, "kmem_cache_alloc_node")) { + process_alloc_event(raw, event, cpu, timestamp, thread, 1); + return; + } + + if (!strcmp(event->name, "kfree") || + !strcmp(event->name, "kmem_cache_free")) { + process_free_event(raw, event, cpu, timestamp, thread); + return; + } +} + +static int +process_sample_event(event_t *event, unsigned long offset, unsigned long head) +{ + u64 ip = event->ip.ip; + u64 timestamp = -1; + u32 cpu = -1; + u64 period = 1; + void *more_data = event->ip.__more_data; + struct thread *thread = threads__findnew(event->ip.pid); + + if (sample_type & PERF_SAMPLE_TIME) { + timestamp = *(u64 *)more_data; + more_data += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_CPU) { + cpu = *(u32 *)more_data; + more_data += sizeof(u32); + more_data += sizeof(u32); /* reserved */ + } + + if (sample_type & PERF_SAMPLE_PERIOD) { + period = *(u64 *)more_data; + more_data += sizeof(u64); + } + + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + (void *)(offset + head), + (void *)(long)(event->header.size), + event->header.misc, + event->ip.pid, event->ip.tid, + (void *)(long)ip, + (long long)period); + + if (thread == NULL) { + pr_debug("problem processing %d event, skipping it.\n", + event->header.type); + return -1; + } + + dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); + + process_raw_event(event, more_data, cpu, timestamp, thread); + + return 0; +} + +static int sample_type_check(u64 type) +{ + sample_type = type; + + if (!(sample_type & PERF_SAMPLE_RAW)) { + fprintf(stderr, + "No trace sample to read. 
Did you call perf record " + "without -R?"); + return -1; + } + + return 0; +} + +static struct perf_file_handler file_handler = { + .process_sample_event = process_sample_event, + .process_comm_event = process_comm_event, + .sample_type_check = sample_type_check, +}; + +static int read_events(void) +{ + register_idle_thread(); + register_perf_file_handler(&file_handler); + + return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0, + &cwdlen, &cwd); +} + +static double fragmentation(unsigned long n_req, unsigned long n_alloc) +{ + if (n_alloc == 0) + return 0.0; + else + return 100.0 - (100.0 * n_req / n_alloc); +} + +static void __print_result(struct rb_root *root, int n_lines, int is_caller) +{ + struct rb_node *next; + + printf("%.102s\n", graph_dotted_line); + printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr"); + printf(" Total_alloc/Per | Total_req/Per | Hit | Ping-pong | Frag\n"); + printf("%.102s\n", graph_dotted_line); + + next = rb_first(root); + + while (next && n_lines--) { + struct alloc_stat *data = rb_entry(next, struct alloc_stat, + node); + struct symbol *sym = NULL; + char buf[BUFSIZ]; + u64 addr; + + if (is_caller) { + addr = data->call_site; + if (!raw_ip) + sym = kernel_maps__find_symbol(addr, + NULL, NULL); + } else + addr = data->ptr; + + if (sym != NULL) + snprintf(buf, sizeof(buf), "%s+%Lx", sym->name, + addr - sym->start); + else + snprintf(buf, sizeof(buf), "%#Lx", addr); + printf(" %-34s |", buf); + + printf(" %9llu/%-5lu | %9llu/%-5lu | %6lu | %8lu | %6.3f%%\n", + (unsigned long long)data->bytes_alloc, + (unsigned long)data->bytes_alloc / data->hit, + (unsigned long long)data->bytes_req, + (unsigned long)data->bytes_req / data->hit, + (unsigned long)data->hit, + (unsigned long)data->pingpong, + fragmentation(data->bytes_req, data->bytes_alloc)); + + next = rb_next(next); + } + + if (n_lines == -1) + printf(" ... | ... | ... | ... | ... | ... 
\n"); + + printf("%.102s\n", graph_dotted_line); +} + +static void print_summary(void) +{ + printf("\nSUMMARY\n=======\n"); + printf("Total bytes requested: %lu\n", total_requested); + printf("Total bytes allocated: %lu\n", total_allocated); + printf("Total bytes wasted on internal fragmentation: %lu\n", + total_allocated - total_requested); + printf("Internal fragmentation: %f%%\n", + fragmentation(total_requested, total_allocated)); + printf("Cross CPU allocations: %lu/%lu\n", nr_cross_allocs, nr_allocs); +} + +static void print_result(void) +{ + if (caller_flag) + __print_result(&root_caller_sorted, caller_lines, 1); + if (alloc_flag) + __print_result(&root_alloc_sorted, alloc_lines, 0); + print_summary(); +} + +struct sort_dimension { + const char name[20]; + sort_fn_t cmp; + struct list_head list; +}; + +static LIST_HEAD(caller_sort); +static LIST_HEAD(alloc_sort); + +static void sort_insert(struct rb_root *root, struct alloc_stat *data, + struct list_head *sort_list) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + struct sort_dimension *sort; + + while (*new) { + struct alloc_stat *this; + int cmp = 0; + + this = rb_entry(*new, struct alloc_stat, node); + parent = *new; + + list_for_each_entry(sort, sort_list, list) { + cmp = sort->cmp(data, this); + if (cmp) + break; + } + + if (cmp > 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void __sort_result(struct rb_root *root, struct rb_root *root_sorted, + struct list_head *sort_list) +{ + struct rb_node *node; + struct alloc_stat *data; + + for (;;) { + node = rb_first(root); + if (!node) + break; + + rb_erase(node, root); + data = rb_entry(node, struct alloc_stat, node); + sort_insert(root_sorted, data, sort_list); + } +} + +static void sort_result(void) +{ + __sort_result(&root_alloc_stat, &root_alloc_sorted, &alloc_sort); + __sort_result(&root_caller_stat, &root_caller_sorted, &caller_sort); +} + +static int __cmd_kmem(void) +{ + setup_pager(); + read_events(); + sort_result(); + print_result(); + + return 0; +} + +static const char * const kmem_usage[] = { + "perf kmem [<options>] {record}", + NULL +}; + +static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + if (l->ptr < r->ptr) + return -1; + else if (l->ptr > r->ptr) + return 1; + return 0; +} + +static struct sort_dimension ptr_sort_dimension = { + .name = "ptr", + .cmp = ptr_cmp, +}; + +static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + if (l->call_site < r->call_site) + return -1; + else if (l->call_site > r->call_site) + return 1; + return 0; +} + +static struct sort_dimension callsite_sort_dimension = { + .name = "callsite", + .cmp = callsite_cmp, +}; + +static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + if (l->hit < r->hit) + return -1; + else if (l->hit > r->hit) + return 1; + return 0; +} + +static struct sort_dimension hit_sort_dimension = { + .name = "hit", + .cmp = hit_cmp, +}; + +static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + if (l->bytes_alloc < r->bytes_alloc) + return -1; + else if (l->bytes_alloc > r->bytes_alloc) + return 1; + return 0; +} + +static struct sort_dimension bytes_sort_dimension = { + .name = "bytes", + .cmp = bytes_cmp, +}; + +static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + double x, y; + + x = fragmentation(l->bytes_req, l->bytes_alloc); + y = fragmentation(r->bytes_req, r->bytes_alloc); + + if (x < y) + 
return -1; + else if (x > y) + return 1; + return 0; +} + +static struct sort_dimension frag_sort_dimension = { + .name = "frag", + .cmp = frag_cmp, +}; + +static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r) +{ + if (l->pingpong < r->pingpong) + return -1; + else if (l->pingpong > r->pingpong) + return 1; + return 0; +} + +static struct sort_dimension pingpong_sort_dimension = { + .name = "pingpong", + .cmp = pingpong_cmp, +}; + +static struct sort_dimension *avail_sorts[] = { + &ptr_sort_dimension, + &callsite_sort_dimension, + &hit_sort_dimension, + &bytes_sort_dimension, + &frag_sort_dimension, + &pingpong_sort_dimension, +}; + +#define NUM_AVAIL_SORTS \ + (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *)) + +static int sort_dimension__add(const char *tok, struct list_head *list) +{ + struct sort_dimension *sort; + int i; + + for (i = 0; i < NUM_AVAIL_SORTS; i++) { + if (!strcmp(avail_sorts[i]->name, tok)) { + sort = malloc(sizeof(*sort)); + if (!sort) + die("malloc"); + memcpy(sort, avail_sorts[i], sizeof(*sort)); + list_add_tail(&sort->list, list); + return 0; + } + } + + return -1; +} + +static int setup_sorting(struct list_head *sort_list, const char *arg) +{ + char *tok; + char *str = strdup(arg); + + if (!str) + die("strdup"); + + while (true) { + tok = strsep(&str, ","); + if (!tok) + break; + if (sort_dimension__add(tok, sort_list) < 0) { + error("Unknown --sort key: '%s'", tok); + return -1; + } + } + + free(str); + return 0; +} + +static int parse_sort_opt(const struct option *opt __used, + const char *arg, int unset __used) +{ + if (!arg) + return -1; + + if (caller_flag > alloc_flag) + return setup_sorting(&caller_sort, arg); + else + return setup_sorting(&alloc_sort, arg); + + return 0; +} + +static int parse_stat_opt(const struct option *opt __used, + const char *arg, int unset __used) +{ + if (!arg) + return -1; + + if (strcmp(arg, "alloc") == 0) + alloc_flag = (caller_flag + 1); + else if (strcmp(arg, "caller") == 0) + caller_flag = (alloc_flag + 1); + else + return -1; + return 0; +} + +static int parse_line_opt(const struct option *opt __used, + const char *arg, int unset __used) +{ + int lines; + + if (!arg) + return -1; + + lines = strtoul(arg, NULL, 10); + + if (caller_flag > alloc_flag) + caller_lines = lines; + else + alloc_lines = lines; + + return 0; +} + +static const struct option kmem_options[] = { + OPT_STRING('i', "input", &input_name, "file", + "input file name"), + OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>", + "stat selector, Pass 'alloc' or 'caller'.", + parse_stat_opt), + OPT_CALLBACK('s', "sort", NULL, "key[,key2...]", + "sort by keys: ptr, call_site, bytes, hit, pingpong, frag", + parse_sort_opt), + OPT_CALLBACK('l', "line", NULL, "num", + "show n lins", + parse_line_opt), + OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"), + OPT_END() +}; + +static const char *record_args[] = { + "record", + "-a", + "-R", + "-M", + "-f", + "-c", "1", + "-e", "kmem:kmalloc", + "-e", "kmem:kmalloc_node", + "-e", "kmem:kfree", + "-e", "kmem:kmem_cache_alloc", + "-e", "kmem:kmem_cache_alloc_node", + "-e", "kmem:kmem_cache_free", +}; + +static int __cmd_record(int argc, const char **argv) +{ + unsigned int rec_argc, i, j; + const char **rec_argv; + + rec_argc = ARRAY_SIZE(record_args) + argc - 1; + rec_argv = calloc(rec_argc + 1, sizeof(char *)); + + for (i = 0; i < ARRAY_SIZE(record_args); i++) + rec_argv[i] = strdup(record_args[i]); + + for (j = 1; j < (unsigned int)argc; j++, i++) + rec_argv[i] = argv[j]; + + return 
cmd_record(i, rec_argv, NULL); +} + +int cmd_kmem(int argc, const char **argv, const char *prefix __used) +{ + symbol__init(0); + + argc = parse_options(argc, argv, kmem_options, kmem_usage, 0); + + if (argc && !strncmp(argv[0], "rec", 3)) + return __cmd_record(argc, argv); + else if (argc) + usage_with_options(kmem_usage, kmem_options); + + if (list_empty(&caller_sort)) + setup_sorting(&caller_sort, default_sort_order); + if (list_empty(&alloc_sort)) + setup_sorting(&alloc_sort, default_sort_order); + + setup_cpunode_map(); + + return __cmd_kmem(); +} + diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 82260c56db3d..0e519c667e3a 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -307,6 +307,12 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[nr_cpu][counter], strerror(err)); + +#if defined(__i386__) || defined(__x86_64__) + if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP) + die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n"); +#endif + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } @@ -400,7 +406,7 @@ static int __cmd_record(int argc, const char **argv) struct stat st; pid_t pid = 0; int flags; - int ret; + int err; unsigned long waking = 0; page_size = sysconf(_SC_PAGE_SIZE); @@ -434,16 +440,18 @@ static int __cmd_record(int argc, const char **argv) exit(-1); } - if (!file_new) - header = perf_header__read(output); - else - header = perf_header__new(); - + header = perf_header__new(); if (header == NULL) { pr_err("Not enough memory for reading perf file header\n"); return -1; } + if (!file_new) { + err = perf_header__read(header, output); + if (err < 0) + return err; + } + if (raw_samples) { perf_header__set_feat(header, HEADER_TRACE_INFO); } else { @@ -472,8 +480,11 @@ static int __cmd_record(int argc, const char **argv) } } - if (file_new) - perf_header__write(header, output, false); + if (file_new) { + err = perf_header__write(header, output, false); + if (err < 0) + return err; + } if (!system_wide) event__synthesize_thread(pid, process_synthesized_event); @@ -527,7 +538,7 @@ static int __cmd_record(int argc, const char **argv) if (hits == samples) { if (done) break; - ret = poll(event_array, nr_poll, -1); + err = poll(event_array, nr_poll, -1); waking++; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1a806d5f05cf..fe474b7f8ad0 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -38,6 +38,7 @@ static char *dso_list_str, *comm_list_str, *sym_list_str, static struct strlist *dso_list, *comm_list, *sym_list; static int force; +static bool use_modules; static int full_paths; static int show_nr_samples; @@ -51,6 +52,7 @@ static char *pretty_printing_style = default_pretty_printing_style; static int exclude_other = 1; static char callchain_default_opt[] = "fractal,0.5"; +const char *vmlinux_name; static char *cwd; static int cwdlen; @@ -448,7 +450,7 @@ got_map: * trick of looking in the whole kernel symbol list. */ if ((long long)ip < 0) - return kernel_maps__find_symbol(ip, mapp); + return kernel_maps__find_symbol(ip, mapp, NULL); } dump_printf(" ...... dso: %s\n", map ? 
map->dso->long_name : "<not found>"); @@ -466,7 +468,7 @@ static int call__match(struct symbol *sym) return 0; } -static struct symbol **resolve_callchain(struct thread *thread, struct map *map, +static struct symbol **resolve_callchain(struct thread *thread, struct ip_callchain *chain, struct symbol **parent) { @@ -495,10 +497,10 @@ static struct symbol **resolve_callchain(struct thread *thread, struct map *map, case PERF_CONTEXT_HV: break; case PERF_CONTEXT_KERNEL: - sym = kernel_maps__find_symbol(ip, &map); + sym = kernel_maps__find_symbol(ip, NULL, NULL); break; default: - sym = resolve_symbol(thread, &map, &ip); + sym = resolve_symbol(thread, NULL, &ip); break; } @@ -528,7 +530,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct hist_entry *he; if ((sort__has_parent || callchain) && chain) - syms = resolve_callchain(thread, map, chain, &parent); + syms = resolve_callchain(thread, chain, &parent); he = __hist_entry__add(thread, map, sym, parent, ip, count, level, &hit); @@ -715,7 +717,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (cpumode == PERF_RECORD_MISC_KERNEL) { level = 'k'; - sym = kernel_maps__find_symbol(ip, &map); + sym = kernel_maps__find_symbol(ip, &map, NULL); dump_printf(" ...... dso: %s\n", map ? map->dso->long_name : "<not found>"); } else if (cpumode == PERF_RECORD_MISC_USER) { @@ -924,8 +926,9 @@ static int __cmd_report(void) register_perf_file_handler(&file_handler); - ret = mmap_dispatch_perf_file(&header, input_name, force, full_paths, - &cwdlen, &cwd); + ret = mmap_dispatch_perf_file(&header, input_name, vmlinux_name, + !vmlinux_name, force, + full_paths, &cwdlen, &cwd); if (ret) return ret; @@ -1023,7 +1026,7 @@ static const struct option options[] = { "dump raw trace in ASCII"), OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), - OPT_BOOLEAN('m', "modules", &modules, + OPT_BOOLEAN('m', "modules", &use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples, "Show a column with the number of samples"), diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index df44b756cecc..260f57a72ee0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1718,7 +1718,8 @@ static int read_events(void) register_idle_thread(); register_perf_file_handler(&file_handler); - return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd); + return mmap_dispatch_perf_file(&header, input_name, NULL, false, 0, 0, + &cwdlen, &cwd); } static void print_bad_events(void) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 665877e4a944..dd4d82ac7aa4 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1093,7 +1093,7 @@ static void process_samples(void) static int __cmd_timechart(void) { - int ret, rc = EXIT_FAILURE; + int err, rc = EXIT_FAILURE; unsigned long offset = 0; unsigned long head, shift; struct stat statbuf; @@ -1111,8 +1111,8 @@ static int __cmd_timechart(void) exit(-1); } - ret = fstat(input, &statbuf); - if (ret < 0) { + err = fstat(input, &statbuf); + if (err < 0) { perror("failed to stat file"); exit(-1); } @@ -1122,7 +1122,16 @@ static int __cmd_timechart(void) exit(0); } - header = perf_header__read(input); + header = perf_header__new(); + if (header == NULL) + return -ENOMEM; + + err = perf_header__read(header, input); + if (err < 0) { + 
perf_header__delete(header); + return err; + } + head = header->data_offset; sample_type = perf_header__sample_type(header); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 07b92c378ae2..6a5de90e9b83 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -79,13 +79,7 @@ static int dump_symtab = 0; static bool hide_kernel_symbols = false; static bool hide_user_symbols = false; static struct winsize winsize; -static const char *graph_line = - "_____________________________________________________________________" - "_____________________________________________________________________"; -static const char *graph_dotted_line = - "---------------------------------------------------------------------" - "---------------------------------------------------------------------" - "---------------------------------------------------------------------"; +const char *vmlinux_name; /* * Source @@ -830,6 +824,8 @@ static void handle_keypress(int c) case 'q': case 'Q': printf("exiting.\n"); + if (dump_symtab) + dsos__fprintf(stderr); exit(0); case 's': prompt_symbol(&sym_filter_entry, "Enter details symbol"); @@ -946,17 +942,6 @@ static int symbol_filter(struct map *map, struct symbol *sym) return 0; } -static int parse_symbols(void) -{ - if (dsos__load_kernel(vmlinux_name, symbol_filter, 1) <= 0) - return -1; - - if (dump_symtab) - dsos__fprintf(stderr); - - return 0; -} - static void event__process_sample(const event_t *self, int counter) { u64 ip = self->ip.ip; @@ -999,7 +984,7 @@ static void event__process_sample(const event_t *self, int counter) if (hide_kernel_symbols) return; - sym = kernel_maps__find_symbol(ip, &map); + sym = kernel_maps__find_symbol(ip, &map, symbol_filter); if (sym == NULL) return; break; @@ -1326,7 +1311,7 @@ static const struct option options[] = { int cmd_top(int argc, const char **argv, const char *prefix __used) { - int counter; + int counter, err; page_size = sysconf(_SC_PAGE_SIZE); @@ -1350,10 +1335,11 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) if (delay_secs < 1) delay_secs = 1; - parse_symbols(); + err = kernel_maps__init(vmlinux_name, !vmlinux_name, true); + if (err < 0) + return err; parse_source(sym_filter_entry); - /* * User specified count overrides default frequency. 
*/ diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d042d656c561..b71198e5dc14 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -131,7 +131,8 @@ static int __cmd_trace(void) register_idle_thread(); register_perf_file_handler(&file_handler); - return mmap_dispatch_perf_file(&header, input_name, 0, 0, &cwdlen, &cwd); + return mmap_dispatch_perf_file(&header, input_name, NULL, false, + 0, 0, &cwdlen, &cwd); } static const char * const annotate_usage[] = { diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h index 9b02d85091fe..a3d8bf65f26c 100644 --- a/tools/perf/builtin.h +++ b/tools/perf/builtin.h @@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix); extern int cmd_trace(int argc, const char **argv, const char *prefix); extern int cmd_version(int argc, const char **argv, const char *prefix); extern int cmd_probe(int argc, const char **argv, const char *prefix); +extern int cmd_kmem(int argc, const char **argv, const char *prefix); #endif diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index d3a6e18e4a5e..02b09ea17a3e 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -14,3 +14,4 @@ perf-timechart mainporcelain common perf-top mainporcelain common perf-trace mainporcelain common perf-probe mainporcelain common +perf-kmem mainporcelain common diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 89b82acac7d9..cf64049bc9bd 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv) { const char *cmd = argv[0]; static struct cmd_struct commands[] = { - { "help", cmd_help, 0 }, - { "list", cmd_list, 0 }, { "buildid-list", cmd_buildid_list, 0 }, - { "record", cmd_record, 0 }, - { "report", cmd_report, 0 }, - { "bench", cmd_bench, 0 }, - { "stat", cmd_stat, 0 }, - { "timechart", cmd_timechart, 0 }, - { "top", cmd_top, 0 }, - { "annotate", cmd_annotate, 0 }, - { "version", cmd_version, 0 }, - { "trace", cmd_trace, 0 }, - { "sched", cmd_sched, 0 }, - { "probe", cmd_probe, 0 }, + { "help", cmd_help, 0 }, + { "list", cmd_list, 0 }, + { "record", cmd_record, 0 }, + { "report", cmd_report, 0 }, + { "bench", cmd_bench, 0 }, + { "stat", cmd_stat, 0 }, + { "timechart", cmd_timechart, 0 }, + { "top", cmd_top, 0 }, + { "annotate", cmd_annotate, 0 }, + { "version", cmd_version, 0 }, + { "trace", cmd_trace, 0 }, + { "sched", cmd_sched, 0 }, + { "probe", cmd_probe, 0 }, + { "kmem", cmd_kmem, 0 }, }; unsigned int i; static const char ext[] = STRIP_EXTENSION; diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c index 0b791bd346bc..35073621e5de 100644 --- a/tools/perf/util/ctype.c +++ b/tools/perf/util/ctype.c @@ -29,3 +29,11 @@ unsigned char sane_ctype[256] = { A, A, A, A, A, A, A, A, A, A, A, R, R, P, P, 0, /* 112..127 */ /* Nothing in the 128.. 
range */ }; + +const char *graph_line = + "_____________________________________________________________________" + "_____________________________________________________________________"; +const char *graph_dotted_line = + "---------------------------------------------------------------------" + "---------------------------------------------------------------------" + "---------------------------------------------------------------------"; diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c index 14cb8465eb08..f318d19b2562 100644 --- a/tools/perf/util/data_map.c +++ b/tools/perf/util/data_map.c @@ -101,12 +101,14 @@ out: int mmap_dispatch_perf_file(struct perf_header **pheader, const char *input_name, + const char *vmlinux_name, + bool try_vmlinux_path, int force, int full_paths, int *cwdlen, char **cwd) { - int ret, rc = EXIT_FAILURE; + int err; struct perf_header *header; unsigned long head, shift; unsigned long offset = 0; @@ -118,56 +120,69 @@ int mmap_dispatch_perf_file(struct perf_header **pheader, int input; char *buf; - if (!curr_handler) - die("Forgot to register perf file handler"); + if (curr_handler == NULL) { + pr_debug("Forgot to register perf file handler\n"); + return -EINVAL; + } page_size = getpagesize(); input = open(input_name, O_RDONLY); if (input < 0) { - fprintf(stderr, " failed to open file: %s", input_name); + pr_err("Failed to open file: %s", input_name); if (!strcmp(input_name, "perf.data")) - fprintf(stderr, " (try 'perf record' first)"); - fprintf(stderr, "\n"); - exit(-1); + pr_err(" (try 'perf record' first)"); + pr_err("\n"); + return -errno; } - ret = fstat(input, &input_stat); - if (ret < 0) { - perror("failed to stat file"); - exit(-1); + if (fstat(input, &input_stat) < 0) { + pr_err("failed to stat file"); + err = -errno; + goto out_close; } + err = -EACCES; if (!force && input_stat.st_uid && (input_stat.st_uid != geteuid())) { - fprintf(stderr, "file: %s not owned by current user or root\n", + pr_err("file: %s not owned by current user or root\n", input_name); - exit(-1); + goto out_close; } - if (!input_stat.st_size) { - fprintf(stderr, "zero-sized file, nothing to do!\n"); - exit(0); + if (input_stat.st_size == 0) { + pr_info("zero-sized file, nothing to do!\n"); + goto done; } - *pheader = perf_header__read(input); - header = *pheader; + err = -ENOMEM; + header = perf_header__new(); + if (header == NULL) + goto out_close; + + err = perf_header__read(header, input); + if (err < 0) + goto out_delete; + *pheader = header; head = header->data_offset; sample_type = perf_header__sample_type(header); - if (curr_handler->sample_type_check) - if (curr_handler->sample_type_check(sample_type) < 0) - exit(-1); + err = -EINVAL; + if (curr_handler->sample_type_check && + curr_handler->sample_type_check(sample_type) < 0) + goto out_delete; - if (load_kernel(NULL) < 0) { - perror("failed to load kernel symbols"); - return EXIT_FAILURE; + err = -ENOMEM; + if (kernel_maps__init(vmlinux_name, try_vmlinux_path, true) < 0) { + pr_err("failed to setup the kernel maps to resolve symbols\n"); + goto out_delete; } if (!full_paths) { if (getcwd(__cwd, sizeof(__cwd)) == NULL) { - perror("failed to get the current directory"); - return EXIT_FAILURE; + pr_err("failed to get the current directory\n"); + err = -errno; + goto out_delete; } *cwd = __cwd; *cwdlen = strlen(*cwd); @@ -181,11 +196,12 @@ int mmap_dispatch_perf_file(struct perf_header **pheader, head -= shift; remap: - buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, - MAP_SHARED, input, 
offset); + buf = mmap(NULL, page_size * mmap_window, PROT_READ, + MAP_SHARED, input, offset); if (buf == MAP_FAILED) { - perror("failed to mmap file"); - exit(-1); + pr_err("failed to mmap file\n"); + err = -errno; + goto out_delete; } more: @@ -242,10 +258,12 @@ more: goto more; done: - rc = EXIT_SUCCESS; + err = 0; +out_close: close(input); - return rc; + return err; +out_delete: + perf_header__delete(header); + goto out_close; } - - diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h index ae036ecd7625..3f0d21b3819e 100644 --- a/tools/perf/util/data_map.h +++ b/tools/perf/util/data_map.h @@ -23,6 +23,8 @@ struct perf_file_handler { void register_perf_file_handler(struct perf_file_handler *handler); int mmap_dispatch_perf_file(struct perf_header **pheader, const char *input_name, + const char *vmlinux_name, + bool try_vmlinux_path, int force, int full_paths, int *cwdlen, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 1f771ce3a957..f1e392612652 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -69,13 +69,6 @@ struct build_id_event { char filename[]; }; -struct build_id_list { - struct build_id_event event; - struct list_head list; - const char *dso_name; - int len; -}; - typedef union event_union { struct perf_event_header header; struct ip_event ip; @@ -122,10 +115,13 @@ typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym); void map__init(struct map *self, u64 start, u64 end, u64 pgoff, struct dso *dso); struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen); +void map__delete(struct map *self); struct map *map__clone(struct map *self); int map__overlap(struct map *l, struct map *r); size_t map__fprintf(struct map *self, FILE *fp); struct symbol *map__find_symbol(struct map *self, u64 ip, symbol_filter_t filter); +void map__fixup_start(struct map *self); +void map__fixup_end(struct map *self); int event__synthesize_thread(pid_t pid, int (*process)(event_t *event)); void event__synthesize_threads(int (*process)(event_t *event)); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index b01a9537977f..1332f8ec04aa 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -78,16 +78,24 @@ struct perf_header *perf_header__new(void) return self; } +void perf_header__delete(struct perf_header *self) +{ + int i; + + for (i = 0; i < self->attrs; ++i) + perf_header_attr__delete(self->attr[i]); + + free(self->attr); + free(self); +} + int perf_header__add_attr(struct perf_header *self, struct perf_header_attr *attr) { - int pos = self->attrs; - if (self->frozen) return -1; - self->attrs++; - if (self->attrs > self->size) { + if (self->attrs == self->size) { int nsize = self->size * 2; struct perf_header_attr **nattr; @@ -98,7 +106,8 @@ int perf_header__add_attr(struct perf_header *self, self->size = nsize; self->attr = nattr; } - self->attr[pos] = attr; + + self->attr[self->attrs++] = attr; return 0; } @@ -167,7 +176,7 @@ static int do_write(int fd, const void *buf, size_t size) int ret = write(fd, buf, size); if (ret < 0) - return -1; + return -errno; size -= ret; buf += ret; @@ -176,43 +185,51 @@ static int do_write(int fd, const void *buf, size_t size) return 0; } -static int write_buildid_table(int fd, struct list_head *id_head) +static int dsos__write_buildid_table(int fd) { - struct build_id_list *iter, *next; - - list_for_each_entry_safe(iter, next, id_head, list) { - struct build_id_event *b = &iter->event; - - if (do_write(fd, b, sizeof(*b)) < 0 || - do_write(fd, 
iter->dso_name, iter->len) < 0) - return -1; - list_del(&iter->list); - free(iter); + struct dso *pos; + + list_for_each_entry(pos, &dsos, node) { + int err; + struct build_id_event b; + size_t len; + + if (!pos->has_build_id) + continue; + len = pos->long_name_len + 1; + len = ALIGN(len, 64); + memset(&b, 0, sizeof(b)); + memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); + b.header.size = sizeof(b) + len; + err = do_write(fd, &b, sizeof(b)); + if (err < 0) + return err; + err = do_write(fd, pos->long_name, len); + if (err < 0) + return err; } return 0; } -static void -perf_header__adds_write(struct perf_header *self, int fd) +static int perf_header__adds_write(struct perf_header *self, int fd) { - LIST_HEAD(id_list); int nr_sections; struct perf_file_section *feat_sec; int sec_size; u64 sec_start; - int idx = 0; + int idx = 0, err; - if (fetch_build_id_table(&id_list)) + if (dsos__read_build_ids()) perf_header__set_feat(self, HEADER_BUILD_ID); nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS); if (!nr_sections) - return; + return 0; feat_sec = calloc(sizeof(*feat_sec), nr_sections); - if (!feat_sec) - die("No memory"); + if (feat_sec == NULL) + return -ENOMEM; sec_size = sizeof(*feat_sec) * nr_sections; @@ -236,25 +253,37 @@ perf_header__adds_write(struct perf_header *self, int fd) buildid_sec = &feat_sec[idx++]; + /* + * Read the kernel buildid nad the list of loaded modules with + * its build_ids: + */ + kernel_maps__init(NULL, false, true); + /* Write build-ids */ buildid_sec->offset = lseek(fd, 0, SEEK_CUR); - if (write_buildid_table(fd, &id_list) < 0) - die("failed to write buildid table"); + err = dsos__write_buildid_table(fd); + if (err < 0) { + pr_debug("failed to write buildid table\n"); + goto out_free; + } buildid_sec->size = lseek(fd, 0, SEEK_CUR) - buildid_sec->offset; } lseek(fd, sec_start, SEEK_SET); - if (do_write(fd, feat_sec, sec_size) < 0) - die("failed to write feature section"); + err = do_write(fd, feat_sec, sec_size); + if (err < 0) + pr_debug("failed to write feature section\n"); +out_free: free(feat_sec); + return err; } -void perf_header__write(struct perf_header *self, int fd, bool at_exit) +int perf_header__write(struct perf_header *self, int fd, bool at_exit) { struct perf_file_header f_header; struct perf_file_attr f_attr; struct perf_header_attr *attr; - int i; + int i, err; lseek(fd, sizeof(f_header), SEEK_SET); @@ -263,8 +292,11 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit) attr = self->attr[i]; attr->id_offset = lseek(fd, 0, SEEK_CUR); - if (do_write(fd, attr->id, attr->ids * sizeof(u64)) < 0) - die("failed to write perf header"); + err = do_write(fd, attr->id, attr->ids * sizeof(u64)); + if (err < 0) { + pr_debug("failed to write perf header\n"); + return err; + } } @@ -280,20 +312,30 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit) .size = attr->ids * sizeof(u64), } }; - if (do_write(fd, &f_attr, sizeof(f_attr)) < 0) - die("failed to write perf header attribute"); + err = do_write(fd, &f_attr, sizeof(f_attr)); + if (err < 0) { + pr_debug("failed to write perf header attribute\n"); + return err; + } } self->event_offset = lseek(fd, 0, SEEK_CUR); self->event_size = event_count * sizeof(struct perf_trace_event_type); - if (events) - if (do_write(fd, events, self->event_size) < 0) - die("failed to write perf header events"); + if (events) { + err = do_write(fd, events, self->event_size); + if (err < 0) { + pr_debug("failed to write perf header events\n"); + return err; + } + } 
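Side note on the dsos__write_buildid_table() hunk above: each record it emits is a build_id_event header whose header.size covers the fixed part plus the DSO long name padded with ALIGN(len, 64), so a reader can walk the whole section by advancing header.size bytes at a time. A minimal sketch of such a walker, assuming the build_id_event layout from event.h and a 20-byte build id (both the struct mirror and the size constant are assumptions for illustration, not part of this patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BUILD_ID_SIZE 20	/* assumption: SHA-1 sized build id */

/* Assumed on-disk layout of one build_id_event record. */
struct build_id_record {
	struct {
		uint32_t type;
		uint16_t misc;
		uint16_t size;	/* sizeof(record) + 64-byte aligned name */
	} header;
	uint8_t	build_id[BUILD_ID_SIZE];
	char	filename[];	/* NUL terminated, padded to a 64 multiple */
};

static void walk_build_id_section(const void *buf, size_t section_size)
{
	const char *p = buf, *end = p + section_size;

	while (p + sizeof(struct build_id_record) <= end) {
		const struct build_id_record *rec = (const void *)p;

		if (rec->header.size < sizeof(*rec) ||
		    p + rec->header.size > end)
			break;		/* truncated or corrupt record */
		printf("%s\n", rec->filename);
		p += rec->header.size;	/* next record starts right after */
	}
}

The 64-byte alignment of the name keeps every record start naturally aligned, which is what lets the writer above compute header.size up front and stream the records without a separate index.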
self->data_offset = lseek(fd, 0, SEEK_CUR); - if (at_exit) - perf_header__adds_write(self, fd); + if (at_exit) { + err = perf_header__adds_write(self, fd); + if (err < 0) + return err; + } f_header = (struct perf_file_header){ .magic = PERF_MAGIC, @@ -316,11 +358,15 @@ void perf_header__write(struct perf_header *self, int fd, bool at_exit) memcpy(&f_header.adds_features, &self->adds_features, sizeof(self->adds_features)); lseek(fd, 0, SEEK_SET); - if (do_write(fd, &f_header, sizeof(f_header)) < 0) - die("failed to write perf header"); + err = do_write(fd, &f_header, sizeof(f_header)); + if (err < 0) { + pr_debug("failed to write perf header\n"); + return err; + } lseek(fd, self->data_offset + self->data_size, SEEK_SET); self->frozen = 1; + return 0; } static void do_read(int fd, void *buf, size_t size) @@ -430,19 +476,17 @@ static int perf_file_section__process(struct perf_file_section *self, return 0; } -struct perf_header *perf_header__read(int fd) +int perf_header__read(struct perf_header *self, int fd) { - struct perf_header *self = perf_header__new(); struct perf_file_header f_header; struct perf_file_attr f_attr; u64 f_id; int nr_attrs, nr_ids, i, j; - if (self == NULL) - die("nomem"); - - if (perf_file_header__read(&f_header, self, fd) < 0) - die("incompatible file format"); + if (perf_file_header__read(&f_header, self, fd) < 0) { + pr_debug("incompatible file format\n"); + return -EINVAL; + } nr_attrs = f_header.attrs.size / sizeof(f_attr); lseek(fd, f_header.attrs.offset, SEEK_SET); @@ -456,7 +500,7 @@ struct perf_header *perf_header__read(int fd) attr = perf_header_attr__new(&f_attr.attr); if (attr == NULL) - die("nomem"); + return -ENOMEM; nr_ids = f_attr.ids.size / sizeof(u64); lseek(fd, f_attr.ids.offset, SEEK_SET); @@ -464,11 +508,15 @@ struct perf_header *perf_header__read(int fd) for (j = 0; j < nr_ids; j++) { do_read(fd, &f_id, sizeof(f_id)); - if (perf_header_attr__add_id(attr, f_id) < 0) - die("nomem"); + if (perf_header_attr__add_id(attr, f_id) < 0) { + perf_header_attr__delete(attr); + return -ENOMEM; + } + } + if (perf_header__add_attr(self, attr) < 0) { + perf_header_attr__delete(attr); + return -ENOMEM; } - if (perf_header__add_attr(self, attr) < 0) - die("nomem"); lseek(fd, tmp, SEEK_SET); } @@ -476,8 +524,8 @@ struct perf_header *perf_header__read(int fd) if (f_header.event_types.size) { lseek(fd, f_header.event_types.offset, SEEK_SET); events = malloc(f_header.event_types.size); - if (!events) - die("nomem"); + if (events == NULL) + return -ENOMEM; do_read(fd, events, f_header.event_types.size); event_count = f_header.event_types.size / sizeof(struct perf_trace_event_type); } @@ -487,8 +535,7 @@ struct perf_header *perf_header__read(int fd) lseek(fd, self->data_offset, SEEK_SET); self->frozen = 1; - - return self; + return 0; } u64 perf_header__sample_type(struct perf_header *header) diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index f46a94e09eea..d1dbe2b79c42 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -55,8 +55,11 @@ struct perf_header { DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS); }; -struct perf_header *perf_header__read(int fd); -void perf_header__write(struct perf_header *self, int fd, bool at_exit); +struct perf_header *perf_header__new(void); +void perf_header__delete(struct perf_header *self); + +int perf_header__read(struct perf_header *self, int fd); +int perf_header__write(struct perf_header *self, int fd, bool at_exit); int perf_header__add_attr(struct perf_header *self, struct perf_header_attr 
*attr); @@ -75,8 +78,6 @@ perf_header__find_attr(u64 id, struct perf_header *header); void perf_header__set_feat(struct perf_header *self, int feat); bool perf_header__has_feat(const struct perf_header *self, int feat); -struct perf_header *perf_header__new(void); - int perf_header__process_sections(struct perf_header *self, int fd, int (*process)(struct perf_file_section *self, int feat, int fd)); diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index ace57c36d1d0..8d63116e9435 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -7,6 +7,8 @@ #define CONFIG_GENERIC_FIND_FIRST_BIT #include "../../../../include/linux/bitops.h" +#undef __KERNEL__ + static inline void set_bit(int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 94ca95073c40..09412321a80d 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -75,6 +75,29 @@ out_delete: return NULL; } +void map__delete(struct map *self) +{ + free(self); +} + +void map__fixup_start(struct map *self) +{ + struct rb_node *nd = rb_first(&self->dso->syms); + if (nd != NULL) { + struct symbol *sym = rb_entry(nd, struct symbol, rb_node); + self->start = sym->start; + } +} + +void map__fixup_end(struct map *self) +{ + struct rb_node *nd = rb_last(&self->dso->syms); + if (nd != NULL) { + struct symbol *sym = rb_entry(nd, struct symbol, rb_node); + self->end = sym->end; + } +} + #define DSO__DELETED "(deleted)" struct symbol * diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 0faf4f2bb5ca..070027469270 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1,4 +1,4 @@ - +#include "../../../include/linux/hw_breakpoint.h" #include "util.h" #include "../perf.h" #include "parse-options.h" @@ -540,6 +540,81 @@ static enum event_result parse_tracepoint_event(const char **strp, attr, strp); } +static enum event_result +parse_breakpoint_type(const char *type, const char **strp, + struct perf_event_attr *attr) +{ + int i; + + for (i = 0; i < 3; i++) { + if (!type[i]) + break; + + switch (type[i]) { + case 'r': + attr->bp_type |= HW_BREAKPOINT_R; + break; + case 'w': + attr->bp_type |= HW_BREAKPOINT_W; + break; + case 'x': + attr->bp_type |= HW_BREAKPOINT_X; + break; + default: + return EVT_FAILED; + } + } + if (!attr->bp_type) /* Default */ + attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W; + + *strp = type + i; + + return EVT_HANDLED; +} + +static enum event_result +parse_breakpoint_event(const char **strp, struct perf_event_attr *attr) +{ + const char *target; + const char *type; + char *endaddr; + u64 addr; + enum event_result err; + + target = strchr(*strp, ':'); + if (!target) + return EVT_FAILED; + + if (strncmp(*strp, "mem", target - *strp) != 0) + return EVT_FAILED; + + target++; + + addr = strtoull(target, &endaddr, 0); + if (target == endaddr) + return EVT_FAILED; + + attr->bp_addr = addr; + *strp = endaddr; + + type = strchr(target, ':'); + + /* If no type is defined, just rw as default */ + if (!type) { + attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W; + } else { + err = parse_breakpoint_type(++type, strp, attr); + if (err == EVT_FAILED) + return EVT_FAILED; + } + + /* We should find a nice way to override the access type */ + attr->bp_len = HW_BREAKPOINT_LEN_4; + attr->type = PERF_TYPE_BREAKPOINT; + + return EVT_HANDLED; +} + static int check_events(const char *str, unsigned int i) { 
int n; @@ -673,6 +748,10 @@ parse_event_symbols(const char **str, struct perf_event_attr *attr) if (ret != EVT_FAILED) goto modifier; + ret = parse_breakpoint_event(str, attr); + if (ret != EVT_FAILED) + goto modifier; + fprintf(stderr, "invalid or unsupported event: '%s'\n", *str); fprintf(stderr, "Run 'perf list' for a list of valid events\n"); return EVT_FAILED; @@ -859,6 +938,9 @@ void print_events(void) "rNNN"); printf("\n"); + printf(" %-42s [hardware breakpoint]\n", "mem:<addr>[:access]"); + printf("\n"); + print_tracepoint_events(); exit(129); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 5cc96c86861b..44d81d5ae8cf 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -9,8 +9,13 @@ #include <libelf.h> #include <gelf.h> #include <elf.h> +#include <limits.h> #include <sys/utsname.h> +#ifndef NT_GNU_BUILD_ID +#define NT_GNU_BUILD_ID 3 +#endif + enum dso_origin { DSO__ORIG_KERNEL = 0, DSO__ORIG_JAVA_JIT, @@ -26,7 +31,11 @@ static void dsos__add(struct dso *dso); static struct dso *dsos__find(const char *name); static struct map *map__new2(u64 start, struct dso *dso); static void kernel_maps__insert(struct map *map); +static int dso__load_kernel_sym(struct dso *self, struct map *map, + symbol_filter_t filter); unsigned int symbol__priv_size; +static int vmlinux_path__nr_entries; +static char **vmlinux_path; static struct rb_root kernel_maps; @@ -69,11 +78,11 @@ static void kernel_maps__fixup_end(void) prev->end = curr->start - 1; } - nd = rb_last(&curr->dso->syms); - if (nd) { - struct symbol *sym = rb_entry(nd, struct symbol, rb_node); - curr->end = sym->end; - } + /* + * We still haven't the actual symbols, so guess the + * last map final address. + */ + curr->end = ~0UL; } static struct symbol *symbol__new(u64 start, u64 len, const char *name) @@ -111,6 +120,8 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp) static void dso__set_long_name(struct dso *self, char *name) { + if (name == NULL) + return; self->long_name = name; self->long_name_len = strlen(name); } @@ -323,7 +334,7 @@ out_failure: * kernel range is broken in several maps, named [kernel].N, as we don't have * the original ELF section names vmlinux have. 
*/ -static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules) +static int kernel_maps__split_kallsyms(symbol_filter_t filter) { struct map *map = kernel_map; struct symbol *pos; @@ -339,9 +350,6 @@ static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules) module = strchr(pos->name, '\t'); if (module) { - if (!use_modules) - goto delete_symbol; - *module++ = '\0'; if (strcmp(map->dso->name, module)) { @@ -381,7 +389,6 @@ static int kernel_maps__split_kallsyms(symbol_filter_t filter, int use_modules) } if (filter && filter(map, pos)) { -delete_symbol: rb_erase(&pos->rb_node, &kernel_map->dso->syms); symbol__delete(pos); } else { @@ -397,17 +404,18 @@ delete_symbol: } -static int kernel_maps__load_kallsyms(symbol_filter_t filter, int use_modules) +static int kernel_maps__load_kallsyms(symbol_filter_t filter) { if (kernel_maps__load_all_kallsyms()) return -1; dso__fixup_sym_end(kernel_map->dso); + kernel_map->dso->origin = DSO__ORIG_KERNEL; - return kernel_maps__split_kallsyms(filter, use_modules); + return kernel_maps__split_kallsyms(filter); } -static size_t kernel_maps__fprintf(FILE *fp) +size_t kernel_maps__fprintf(FILE *fp) { size_t printed = fprintf(fp, "Kernel maps:\n"); struct rb_node *nd; @@ -883,47 +891,40 @@ out_close: return err; } -bool fetch_build_id_table(struct list_head *head) +static bool dso__build_id_equal(const struct dso *self, u8 *build_id) { - bool have_buildid = false; - struct dso *pos; - - list_for_each_entry(pos, &dsos, node) { - struct build_id_list *new; - struct build_id_event b; - size_t len; - - if (filename__read_build_id(pos->long_name, - &b.build_id, - sizeof(b.build_id)) < 0) - continue; - have_buildid = true; - memset(&b.header, 0, sizeof(b.header)); - len = pos->long_name_len + 1; - len = ALIGN(len, 64); - b.header.size = sizeof(b) + len; - - new = malloc(sizeof(*new)); - if (!new) - die("No memory\n"); + return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0; +} - memcpy(&new->event, &b, sizeof(b)); - new->dso_name = pos->long_name; - new->len = len; +bool dsos__read_build_ids(void) +{ + bool have_build_id = false; + struct dso *pos; - list_add_tail(&new->list, head); - } + list_for_each_entry(pos, &dsos, node) + if (filename__read_build_id(pos->long_name, pos->build_id, + sizeof(pos->build_id)) > 0) { + have_build_id = true; + pos->has_build_id = true; + } - return have_buildid; + return have_build_id; } +/* + * Align offset to 4 bytes as needed for note name and descriptor data. 
+ */ +#define NOTE_ALIGN(n) (((n) + 3) & -4U) + int filename__read_build_id(const char *filename, void *bf, size_t size) { int fd, err = -1; GElf_Ehdr ehdr; GElf_Shdr shdr; - Elf_Data *build_id_data; + Elf_Data *data; Elf_Scn *sec; + Elf_Kind ek; + void *ptr; Elf *elf; if (size < BUILD_ID_SIZE) @@ -939,6 +940,10 @@ int filename__read_build_id(const char *filename, void *bf, size_t size) goto out_close; } + ek = elf_kind(elf); + if (ek != ELF_K_ELF) + goto out_elf_end; + if (gelf_getehdr(elf, &ehdr) == NULL) { pr_err("%s: cannot get elf header.\n", __func__); goto out_elf_end; @@ -946,14 +951,37 @@ int filename__read_build_id(const char *filename, void *bf, size_t size) sec = elf_section_by_name(elf, &ehdr, &shdr, ".note.gnu.build-id", NULL); - if (sec == NULL) - goto out_elf_end; + if (sec == NULL) { + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".notes", NULL); + if (sec == NULL) + goto out_elf_end; + } - build_id_data = elf_getdata(sec, NULL); - if (build_id_data == NULL) + data = elf_getdata(sec, NULL); + if (data == NULL) goto out_elf_end; - memcpy(bf, build_id_data->d_buf + 16, BUILD_ID_SIZE); - err = BUILD_ID_SIZE; + + ptr = data->d_buf; + while (ptr < (data->d_buf + data->d_size)) { + GElf_Nhdr *nhdr = ptr; + int namesz = NOTE_ALIGN(nhdr->n_namesz), + descsz = NOTE_ALIGN(nhdr->n_descsz); + const char *name; + + ptr += sizeof(*nhdr); + name = ptr; + ptr += namesz; + if (nhdr->n_type == NT_GNU_BUILD_ID && + nhdr->n_namesz == sizeof("GNU")) { + if (memcmp(name, "GNU", sizeof("GNU")) == 0) { + memcpy(bf, ptr, BUILD_ID_SIZE); + err = BUILD_ID_SIZE; + break; + } + } + ptr += descsz; + } out_elf_end: elf_end(elf); out_close: @@ -962,23 +990,48 @@ out: return err; } -static char *dso__read_build_id(struct dso *self) +int sysfs__read_build_id(const char *filename, void *build_id, size_t size) { - int len; - char *build_id = NULL; - unsigned char rawbf[BUILD_ID_SIZE]; + int fd, err = -1; - len = filename__read_build_id(self->long_name, rawbf, sizeof(rawbf)); - if (len < 0) + if (size < BUILD_ID_SIZE) goto out; - build_id = malloc(len * 2 + 1); - if (build_id == NULL) + fd = open(filename, O_RDONLY); + if (fd < 0) goto out; - build_id__sprintf(rawbf, len, build_id); + while (1) { + char bf[BUFSIZ]; + GElf_Nhdr nhdr; + int namesz, descsz; + + if (read(fd, &nhdr, sizeof(nhdr)) != sizeof(nhdr)) + break; + + namesz = NOTE_ALIGN(nhdr.n_namesz); + descsz = NOTE_ALIGN(nhdr.n_descsz); + if (nhdr.n_type == NT_GNU_BUILD_ID && + nhdr.n_namesz == sizeof("GNU")) { + if (read(fd, bf, namesz) != namesz) + break; + if (memcmp(bf, "GNU", sizeof("GNU")) == 0) { + if (read(fd, build_id, + BUILD_ID_SIZE) == BUILD_ID_SIZE) { + err = 0; + break; + } + } else if (read(fd, bf, descsz) != descsz) + break; + } else { + int n = namesz + descsz; + if (read(fd, bf, n) != n) + break; + } + } + close(fd); out: - return build_id; + return err; } char dso__symtab_origin(const struct dso *self) @@ -1001,12 +1054,17 @@ char dso__symtab_origin(const struct dso *self) int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) { int size = PATH_MAX; - char *name = malloc(size), *build_id = NULL; + char *name; + u8 build_id[BUILD_ID_SIZE]; int ret = -1; int fd; self->loaded = 1; + if (self->kernel) + return dso__load_kernel_sym(self, map, filter); + + name = malloc(size); if (!name) return -1; @@ -1023,8 +1081,6 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter) more: do { - int berr = 0; - self->origin++; switch (self->origin) { case DSO__ORIG_FEDORA: @@ -1036,12 +1092,18 @@ more: 
self->long_name); break; case DSO__ORIG_BUILDID: - build_id = dso__read_build_id(self); - if (build_id != NULL) { + if (filename__read_build_id(self->long_name, build_id, + sizeof(build_id))) { + char build_id_hex[BUILD_ID_SIZE * 2 + 1]; + + build_id__sprintf(build_id, sizeof(build_id), + build_id_hex); snprintf(name, size, "/usr/lib/debug/.build-id/%.2s/%s.debug", - build_id, build_id + 2); - goto compare_build_id; + build_id_hex, build_id_hex + 2); + if (self->has_build_id) + goto compare_build_id; + break; } self->origin++; /* Fall thru */ @@ -1054,18 +1116,11 @@ more: } if (self->has_build_id) { - bool match; - build_id = malloc(BUILD_ID_SIZE); - if (build_id == NULL) + if (filename__read_build_id(name, build_id, + sizeof(build_id)) < 0) goto more; - berr = filename__read_build_id(name, build_id, - BUILD_ID_SIZE); compare_build_id: - match = berr > 0 && memcmp(build_id, self->build_id, - sizeof(self->build_id)) == 0; - free(build_id); - build_id = NULL; - if (!match) + if (!dso__build_id_equal(self, build_id)) goto more; } @@ -1100,7 +1155,8 @@ static void kernel_maps__insert(struct map *map) maps__insert(&kernel_maps, map); } -struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp) +struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp, + symbol_filter_t filter) { struct map *map = maps__find(&kernel_maps, ip); @@ -1109,7 +1165,7 @@ struct symbol *kernel_maps__find_symbol(u64 ip, struct map **mapp) if (map) { ip = map->map_ip(map, ip); - return map->dso->find_symbol(map->dso, ip); + return map__find_symbol(map, ip, filter); } return NULL; @@ -1129,32 +1185,13 @@ struct map *kernel_maps__find_by_dso_name(const char *name) return NULL; } -static int dso__load_module_sym(struct dso *self, struct map *map, - symbol_filter_t filter) -{ - int err = 0, fd = open(self->long_name, O_RDONLY); - - self->loaded = 1; - - if (fd < 0) { - pr_err("%s: cannot open %s\n", __func__, self->long_name); - return err; - } - - err = dso__load_sym(self, map, self->long_name, fd, filter, 0, 1); - close(fd); - - return err; -} - -static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter) +static int dsos__set_modules_path_dir(char *dirname) { struct dirent *dent; - int nr_symbols = 0, err; DIR *dir = opendir(dirname); if (!dir) { - pr_err("%s: cannot open %s dir\n", __func__, dirname); + pr_debug("%s: cannot open %s dir\n", __func__, dirname); return -1; } @@ -1168,14 +1205,12 @@ static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter) snprintf(path, sizeof(path), "%s/%s", dirname, dent->d_name); - err = dsos__load_modules_sym_dir(path, filter); - if (err < 0) + if (dsos__set_modules_path_dir(path) < 0) goto failure; } else { char *dot = strrchr(dent->d_name, '.'), dso_name[PATH_MAX]; struct map *map; - struct rb_node *last; char *long_name; if (dot == NULL || strcmp(dot, ".ko")) @@ -1195,36 +1230,16 @@ static int dsos__load_modules_sym_dir(char *dirname, symbol_filter_t filter) if (long_name == NULL) goto failure; dso__set_long_name(map->dso, long_name); - dso__set_basename(map->dso); - - err = dso__load_module_sym(map->dso, map, filter); - if (err < 0) - goto failure; - last = rb_last(&map->dso->syms); - if (last) { - struct symbol *sym; - /* - * We do this here as well, even having the - * symbol size found in the symtab because - * misannotated ASM symbols may have the size - * set to zero. 
- */ - dso__fixup_sym_end(map->dso); - - sym = rb_entry(last, struct symbol, rb_node); - map->end = map->start + sym->end; - } } - nr_symbols += err; } - return nr_symbols; + return 0; failure: closedir(dir); return -1; } -static int dsos__load_modules_sym(symbol_filter_t filter) +static int dsos__set_modules_path(void) { struct utsname uts; char modules_path[PATH_MAX]; @@ -1235,7 +1250,7 @@ static int dsos__load_modules_sym(symbol_filter_t filter) snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel", uts.release); - return dsos__load_modules_sym_dir(modules_path, filter); + return dsos__set_modules_path_dir(modules_path); } /* @@ -1257,7 +1272,7 @@ static struct map *map__new2(u64 start, struct dso *dso) return self; } -static int dsos__load_modules(void) +static int kernel_maps__create_module_maps(void) { char *line = NULL; size_t n; @@ -1307,6 +1322,12 @@ static int dsos__load_modules(void) goto out_delete_line; } + snprintf(name, sizeof(name), + "/sys/module/%s/notes/.note.gnu.build-id", line); + if (sysfs__read_build_id(name, dso->build_id, + sizeof(dso->build_id)) == 0) + dso->has_build_id = true; + dso->origin = DSO__ORIG_KMODULE; kernel_maps__insert(map); dsos__add(dso); @@ -1315,7 +1336,7 @@ static int dsos__load_modules(void) free(line); fclose(file); - return 0; + return dsos__set_modules_path(); out_delete_line: free(line); @@ -1326,13 +1347,37 @@ out_failure: static int dso__load_vmlinux(struct dso *self, struct map *map, const char *vmlinux, symbol_filter_t filter) { - int err, fd = open(vmlinux, O_RDONLY); + int err = -1, fd; - self->loaded = 1; + if (self->has_build_id) { + u8 build_id[BUILD_ID_SIZE]; + + if (filename__read_build_id(vmlinux, build_id, + sizeof(build_id)) < 0) { + pr_debug("No build_id in %s, ignoring it\n", vmlinux); + return -1; + } + if (!dso__build_id_equal(self, build_id)) { + char expected_build_id[BUILD_ID_SIZE * 2 + 1], + vmlinux_build_id[BUILD_ID_SIZE * 2 + 1]; + + build_id__sprintf(self->build_id, + sizeof(self->build_id), + expected_build_id); + build_id__sprintf(build_id, sizeof(build_id), + vmlinux_build_id); + pr_debug("build_id in %s is %s while expected is %s, " + "ignoring it\n", vmlinux, vmlinux_build_id, + expected_build_id); + return -1; + } + } + fd = open(vmlinux, O_RDONLY); if (fd < 0) return -1; + self->loaded = 1; err = dso__load_sym(self, map, self->long_name, fd, filter, 1, 0); close(fd); @@ -1340,78 +1385,55 @@ static int dso__load_vmlinux(struct dso *self, struct map *map, return err; } -int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter, - int use_modules) +static int dso__load_kernel_sym(struct dso *self, struct map *map, + symbol_filter_t filter) { - int err = -1; - struct dso *dso = dso__new(vmlinux); - - if (dso == NULL) - return -1; - - dso->short_name = "[kernel]"; - kernel_map = map__new2(0, dso); - if (kernel_map == NULL) - goto out_delete_dso; - - kernel_map->map_ip = kernel_map->unmap_ip = identity__map_ip; - - if (use_modules && dsos__load_modules() < 0) { - pr_warning("Failed to load list of modules in use! " - "Continuing...\n"); - use_modules = 0; - } - - if (vmlinux) { - err = dso__load_vmlinux(dso, kernel_map, vmlinux, filter); - if (err > 0 && use_modules) { - int syms = dsos__load_modules_sym(filter); - - if (syms < 0) - pr_warning("Failed to read module symbols!" 
- " Continuing...\n"); - else - err += syms; + int err; + bool is_kallsyms; + + if (vmlinux_path != NULL) { + int i; + pr_debug("Looking at the vmlinux_path (%d entries long)\n", + vmlinux_path__nr_entries); + for (i = 0; i < vmlinux_path__nr_entries; ++i) { + err = dso__load_vmlinux(self, map, vmlinux_path[i], + filter); + if (err > 0) { + pr_debug("Using %s for symbols\n", + vmlinux_path[i]); + dso__set_long_name(self, + strdup(vmlinux_path[i])); + goto out_fixup; + } } } - if (err <= 0) - err = kernel_maps__load_kallsyms(filter, use_modules); + is_kallsyms = self->long_name[0] == '['; + if (is_kallsyms) + goto do_kallsyms; + + err = dso__load_vmlinux(self, map, self->long_name, filter); + if (err <= 0) { + pr_info("The file %s cannot be used, " + "trying to use /proc/kallsyms...", self->long_name); + sleep(2); +do_kallsyms: + err = kernel_maps__load_kallsyms(filter); + if (err > 0 && !is_kallsyms) + dso__set_long_name(self, strdup("[kernel.kallsyms]")); + } if (err > 0) { - struct rb_node *node = rb_first(&dso->syms); - struct symbol *sym = rb_entry(node, struct symbol, rb_node); - - kernel_map->start = sym->start; - node = rb_last(&dso->syms); - sym = rb_entry(node, struct symbol, rb_node); - kernel_map->end = sym->end; - - dso->origin = DSO__ORIG_KERNEL; - kernel_maps__insert(kernel_map); - /* - * Now that we have all sorted out, just set the ->end of all - * maps: - */ - kernel_maps__fixup_end(); - dsos__add(dso); - - if (verbose) - kernel_maps__fprintf(stderr); +out_fixup: + map__fixup_start(map); + map__fixup_end(map); } return err; - -out_delete_dso: - dso__delete(dso); - return -1; } LIST_HEAD(dsos); -struct dso *vdso; - -const char *vmlinux_name = "vmlinux"; -int modules; +struct dso *vdso; static void dsos__add(struct dso *dso) { @@ -1463,18 +1485,117 @@ size_t dsos__fprintf_buildid(FILE *fp) return ret; } -int load_kernel(symbol_filter_t filter) +static int kernel_maps__create_kernel_map(const char *vmlinux_name) { - if (dsos__load_kernel(vmlinux_name, filter, modules) <= 0) + struct dso *kernel = dso__new(vmlinux_name ?: "[kernel.kallsyms]"); + + if (kernel == NULL) return -1; + kernel_map = map__new2(0, kernel); + if (kernel_map == NULL) + goto out_delete_kernel_dso; + + kernel_map->map_ip = kernel_map->unmap_ip = identity__map_ip; + kernel->short_name = "[kernel]"; + kernel->kernel = 1; + vdso = dso__new("[vdso]"); - if (!vdso) - return -1; + if (vdso == NULL) + goto out_delete_kernel_map; + + if (sysfs__read_build_id("/sys/kernel/notes", kernel->build_id, + sizeof(kernel->build_id)) == 0) + kernel->has_build_id = true; + kernel_maps__insert(kernel_map); + dsos__add(kernel); dsos__add(vdso); return 0; + +out_delete_kernel_map: + map__delete(kernel_map); + kernel_map = NULL; +out_delete_kernel_dso: + dso__delete(kernel); + return -1; +} + +static void vmlinux_path__exit(void) +{ + while (--vmlinux_path__nr_entries >= 0) { + free(vmlinux_path[vmlinux_path__nr_entries]); + vmlinux_path[vmlinux_path__nr_entries] = NULL; + } + + free(vmlinux_path); + vmlinux_path = NULL; +} + +static int vmlinux_path__init(void) +{ + struct utsname uts; + char bf[PATH_MAX]; + + if (uname(&uts) < 0) + return -1; + + vmlinux_path = malloc(sizeof(char *) * 5); + if (vmlinux_path == NULL) + return -1; + + vmlinux_path[vmlinux_path__nr_entries] = strdup("vmlinux"); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + goto out_fail; + ++vmlinux_path__nr_entries; + vmlinux_path[vmlinux_path__nr_entries] = strdup("/boot/vmlinux"); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + goto out_fail; 
+ ++vmlinux_path__nr_entries; + snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release); + vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + goto out_fail; + ++vmlinux_path__nr_entries; + snprintf(bf, sizeof(bf), "/lib/modules/%s/build/vmlinux", uts.release); + vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + goto out_fail; + ++vmlinux_path__nr_entries; + snprintf(bf, sizeof(bf), "/usr/lib/debug/lib/modules/%s/vmlinux", + uts.release); + vmlinux_path[vmlinux_path__nr_entries] = strdup(bf); + if (vmlinux_path[vmlinux_path__nr_entries] == NULL) + goto out_fail; + ++vmlinux_path__nr_entries; + + return 0; + +out_fail: + vmlinux_path__exit(); + return -1; +} + +int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path, + bool use_modules) +{ + if (try_vmlinux_path && vmlinux_path__init() < 0) + return -1; + + if (kernel_maps__create_kernel_map(vmlinux_name) < 0) { + vmlinux_path__exit(); + return -1; + } + + if (use_modules && kernel_maps__create_module_maps() < 0) + pr_debug("Failed to load list of modules in use, " + "continuing...\n"); + /* + * Now that we have all the maps created, just set the ->end of them: + */ + kernel_maps__fixup_end(); + return 0; } void symbol__init(unsigned int priv_size) diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 5ad1019607dd..8c4d026e067a 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -64,6 +64,7 @@ struct dso { u8 slen_calculated:1; u8 loaded:1; u8 has_build_id:1; + u8 kernel:1; unsigned char origin; u8 build_id[BUILD_ID_SIZE]; u16 long_name_len; @@ -77,7 +78,6 @@ void dso__delete(struct dso *self); struct symbol *dso__find_symbol(struct dso *self, u64 ip); -int dsos__load_kernel(const char *vmlinux, symbol_filter_t filter, int modules); struct dso *dsos__findnew(const char *name); int dso__load(struct dso *self, struct map *map, symbol_filter_t filter); void dsos__fprintf(FILE *fp); @@ -89,16 +89,17 @@ char dso__symtab_origin(const struct dso *self); void dso__set_build_id(struct dso *self, void *build_id); int filename__read_build_id(const char *filename, void *bf, size_t size); -bool fetch_build_id_table(struct list_head *head); +int sysfs__read_build_id(const char *filename, void *bf, size_t size); +bool dsos__read_build_ids(void); int build_id__sprintf(u8 *self, int len, char *bf); -int load_kernel(symbol_filter_t filter); +int kernel_maps__init(const char *vmlinux_name, bool try_vmlinux_path, + bool use_modules); +size_t kernel_maps__fprintf(FILE *fp); void symbol__init(unsigned int priv_size); extern struct list_head dsos; extern struct map *kernel_map; extern struct dso *vdso; -extern const char *vmlinux_name; -extern int modules; #endif /* __PERF_SYMBOL */ diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 53addd77ce8f..e4b8d437725a 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -26,7 +26,8 @@ size_t threads__fprintf(FILE *fp); void maps__insert(struct rb_root *maps, struct map *map); struct map *maps__find(struct rb_root *maps, u64 ip); -struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp); +struct symbol *kernel_maps__find_symbol(const u64 ip, struct map **mapp, + symbol_filter_t filter); struct map *kernel_maps__find_by_dso_name(const char *name); static inline struct map *thread__find_map(struct thread *self, u64 ip) diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c 
index 831052d4b4fb..cace35595530 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -33,11 +33,11 @@ #include <ctype.h> #include <errno.h> #include <stdbool.h> +#include <linux/kernel.h> #include "../perf.h" #include "trace-event.h" - #define VERSION "0.5" #define _STR(x) #x @@ -483,23 +483,31 @@ static struct tracepoint_path * get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) { struct tracepoint_path path, *ppath = &path; - int i; + int i, nr_tracepoints = 0; for (i = 0; i < nb_events; i++) { if (pattrs[i].type != PERF_TYPE_TRACEPOINT) continue; + ++nr_tracepoints; ppath->next = tracepoint_id_to_path(pattrs[i].config); if (!ppath->next) die("%s\n", "No memory to alloc tracepoints list"); ppath = ppath->next; } - return path.next; + return nr_tracepoints > 0 ? path.next : NULL; } -void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) + +int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) { char buf[BUFSIZ]; - struct tracepoint_path *tps; + struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events); + + /* + * What? No tracepoints? No sense writing anything here, bail out. + */ + if (tps == NULL) + return -1; output_fd = fd; @@ -528,11 +536,11 @@ void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events) page_size = getpagesize(); write_or_die(&page_size, 4); - tps = get_tracepoints_path(pattrs, nb_events); - read_header_files(); read_ftrace_files(tps); read_event_files(tps); read_proc_kallsyms(); read_ftrace_printk(); + + return 0; } diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 44292e06cca4..342dfdd43f87 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -471,11 +471,11 @@ void trace_report(int fd) read_or_die(buf, 3); if (memcmp(buf, test, 3) != 0) - die("not an trace data file"); + die("no trace data in the file"); read_or_die(buf, 7); if (memcmp(buf, "tracing", 7) != 0) - die("not a trace file (missing tracing)"); + die("not a trace file (missing 'tracing' tag)"); version = read_string(); if (show_version) diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index f6637c2fa1fe..dd51c6872a15 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -248,7 +248,7 @@ unsigned long long raw_field_value(struct event *event, const char *name, void *data); void *raw_field_ptr(struct event *event, const char *name, void *data); -void read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events); +int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events); /* taken from kernel/trace/trace.h */ enum trace_flag_type { diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index f2203a0946bc..e1c623e0c99e 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -84,6 +84,9 @@ #include <iconv.h> #endif +extern const char *graph_line; +extern const char *graph_dotted_line; + /* On most systems <limits.h> would have given us this, but * not on some systems (e.g. GNU/Hurd). */ |
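Closing note on the parse-events.c changes earlier in this series: the new mem:&lt;addr&gt;[:access] syntax lets a hardware breakpoint be requested like any other event, e.g. perf record -e mem:0x&lt;addr&gt;:w. Roughly, the attr that parse_breakpoint_event() builds for such a string looks like the sketch below; only the fields the parser touches are shown, and the helper name, the zeroed attr and the include paths are illustrative assumptions rather than code from this patch:

#include <string.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

/*
 * Illustrative only: what an event string like "mem:<addr>:w" ends up
 * requesting.  A real caller would also fill attr->size and the
 * sampling-related fields.
 */
static void setup_mem_write_breakpoint(struct perf_event_attr *attr,
				       unsigned long addr)
{
	memset(attr, 0, sizeof(*attr));
	attr->type    = PERF_TYPE_BREAKPOINT;
	attr->bp_addr = addr;			/* parsed after "mem:" */
	attr->bp_type = HW_BREAKPOINT_W;	/* from the ":w" suffix */
	attr->bp_len  = HW_BREAKPOINT_LEN_4;	/* parser's current default */
}

Omitting the access suffix falls back to HW_BREAKPOINT_R | HW_BREAKPOINT_W, matching the new entry that print_events() advertises in perf list: "mem:<addr>[:access]  [hardware breakpoint]".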