Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 201 |
1 files changed, 123 insertions, 78 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 953c14348375..f5744010a8d2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
 	do_div(tmp, 100);
-	atomic_set(&perf_sample_allowed_ns, tmp);
+	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-	if (atomic_read(&perf_sample_allowed_ns) == 0)
+	if (allowed_ns == 0)
 		return;
 
 	/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 	 */
 	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+	if (avg_local_sample_len <= allowed_ns)
 		return;
 
 	if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %d), lowering "
+			"perf samples too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len,
-			atomic_read(&perf_sample_allowed_ns),
+			avg_local_sample_len, allowed_ns,
 			sysctl_perf_event_sample_rate);
 
 	update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
 		put_ctx(ctx->parent_ctx);
 		ctx->parent_ctx = NULL;
 	}
+	ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+
+	ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		size += sizeof(data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		size += sizeof(data->txn);
+
 	event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+
+	ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -1388,6 +1396,8 @@ event_sched_out(struct perf_event *event,
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
+	perf_pmu_disable(event->pmu);
+
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
@@ -1404,6 +1414,8 @@ event_sched_out(struct perf_event *event,
 		ctx->nr_freq--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+
+	perf_pmu_enable(event->pmu);
 }
 
 static void
@@ -1644,6 +1656,7 @@ event_sched_in(struct perf_event *event,
 		 struct perf_event_context *ctx)
 {
 	u64 tstamp = perf_event_time(event);
+	int ret = 0;
 
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -1666,10 +1679,13 @@ event_sched_in(struct perf_event *event,
 	 */
 	smp_wmb();
 
+	perf_pmu_disable(event->pmu);
+
 	if (event->pmu->add(event, PERF_EF_START)) {
 		event->state = PERF_EVENT_STATE_INACTIVE;
 		event->oncpu = -1;
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto out;
 	}
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
@@ -1685,7 +1701,10 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
-	return 0;
+out:
+	perf_pmu_enable(event->pmu);
+
+	return ret;
 }
 
 static int
@@ -2146,22 +2165,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
 			 struct perf_event_context *ctx2)
 {
-	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& !ctx1->pin_count && !ctx2->pin_count;
+	/* Pinning disables the swap optimization */
+	if (ctx1->pin_count || ctx2->pin_count)
+		return 0;
+
+	/* If ctx1 is the parent of ctx2 */
+	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+		return 1;
+
+	/* If ctx2 is the parent of ctx1 */
+	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+		return 1;
+
+	/*
+	 * If ctx1 and ctx2 have the same parent; we flatten the parent
+	 * hierarchy, see perf_event_init_context().
+	 */
+	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+			ctx1->parent_gen == ctx2->parent_gen)
+		return 1;
+
+	/* Unmatched */
+	return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
@@ -2210,9 +2245,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	perf_event_update_userpage(next_event);
 }
 
-#define list_next_entry(pos, member) \
-	list_entry(pos->member.next, typeof(*pos), member)
-
 static void perf_event_sync_stat(struct perf_event_context *ctx,
 				 struct perf_event_context *next_ctx)
 {
@@ -2244,7 +2276,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
-	struct perf_event_context *parent;
+	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
@@ -2256,10 +2288,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		return;
 
 	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+	if (!next_ctx)
+		goto unlock;
+
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+	/* If neither context have a parent context; they cannot be clones. */
+	if (!parent && !next_parent)
+		goto unlock;
+
+	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
 		/*
 		 * Looks like the two contexts are clones, so we might be
 		 * able to optimize the context switch. We lock both
@@ -2287,6 +2327,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_unlock(&next_ctx->lock);
 		raw_spin_unlock(&ctx->lock);
 	}
+unlock:
 	rcu_read_unlock();
 
 	if (do_switch) {
@@ -2713,6 +2754,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		perf_pmu_disable(event->pmu);
+
 		hwc = &event->hw;
 
 		if (hwc->interrupts == MAX_INTERRUPTS) {
@@ -2722,7 +2765,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
-			continue;
+			goto next;
 
 		/*
 		 * stop the event and update event->count
@@ -2744,6 +2787,8 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
 			perf_adjust_period(event, period, delta, false);
 
 		event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
+	next:
+		perf_pmu_enable(event->pmu);
 	}
 
 	perf_pmu_enable(ctx->pmu);
@@ -4572,6 +4617,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		perf_output_put(handle, data->txn);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5100,27 +5148,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	const char *name;
-
-	memset(tmp, 0, sizeof(tmp));
+	char *name;
 
 	if (file) {
 		struct inode *inode;
 		dev_t dev;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = "//enomem";
+			goto cpy_name;
+		}
 		/*
-		 * d_path works from the end of the rb backwards, so we
+		 * d_path() works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-		if (!buf) {
-			name = strncpy(tmp, "//enomem", sizeof(tmp));
-			goto got_name;
-		}
-		name = d_path(&file->f_path, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
-			name = strncpy(tmp, "//toolong", sizeof(tmp));
-			goto got_name;
+			name = "//toolong";
+			goto cpy_name;
 		}
 		inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
@@ -5128,34 +5175,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		gen = inode->i_generation;
 		maj = MAJOR(dev);
 		min = MINOR(dev);
-
+		goto got_name;
 	} else {
-		if (arch_vma_name(mmap_event->vma)) {
-			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp) - 1);
-			tmp[sizeof(tmp) - 1] = '\0';
-			goto got_name;
-		}
+		name = (char *)arch_vma_name(vma);
+		if (name)
+			goto cpy_name;
 
-		if (!vma->vm_mm) {
-			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+		if (vma->vm_start <= vma->vm_mm->start_brk &&
 				vma->vm_end >= vma->vm_mm->brk) {
-			name = strncpy(tmp, "[heap]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+			name = "[heap]";
+			goto cpy_name;
+		}
+		if (vma->vm_start <= vma->vm_mm->start_stack &&
 				vma->vm_end >= vma->vm_mm->start_stack) {
-			name = strncpy(tmp, "[stack]", sizeof(tmp));
-			goto got_name;
+			name = "[stack]";
+			goto cpy_name;
 		}
 
-		name = strncpy(tmp, "//anon", sizeof(tmp));
-		goto got_name;
+		name = "//anon";
+		goto cpy_name;
 	}
 
+cpy_name:
+	strlcpy(tmp, name, sizeof(tmp));
+	name = tmp;
 got_name:
-	size = ALIGN(strlen(name)+1, sizeof(u64));
+	/*
+	 * Since our buffer works in 8 byte units we need to align our string
+	 * size to a multiple of 8. However, we must guarantee the tail end is
+	 * zero'd out to avoid leaking random bits to userspace.
+	 */
+	size = strlen(name)+1;
+	while (!IS_ALIGNED(size, sizeof(u64)))
+		name[size++] = '\0';
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
@@ -5643,11 +5695,6 @@ static void swevent_hlist_put(struct perf_event *event)
 {
 	int cpu;
 
-	if (event->cpu != -1) {
-		swevent_hlist_put_cpu(event, event->cpu);
-		return;
-	}
-
 	for_each_possible_cpu(cpu)
 		swevent_hlist_put_cpu(event, cpu);
 }
@@ -5681,9 +5728,6 @@ static int swevent_hlist_get(struct perf_event *event)
 	int err;
 	int cpu, failed_cpu;
 
-	if (event->cpu != -1)
-		return swevent_hlist_get_cpu(event, event->cpu);
-
 	get_online_cpus();
 	for_each_possible_cpu(cpu) {
 		err = swevent_hlist_get_cpu(event, cpu);
@@ -6292,6 +6336,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
 
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
+static DEVICE_ATTR_RO(type);
 
 static ssize_t
 perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6381,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
 
 	return count;
 }
+static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 
-static struct device_attribute pmu_dev_attrs[] = {
-	__ATTR_RO(type),
-	__ATTR_RW(perf_event_mux_interval_ms),
-	__ATTR_NULL,
+static struct attribute *pmu_dev_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_perf_event_mux_interval_ms.attr,
+	NULL,
 };
+ATTRIBUTE_GROUPS(pmu_dev);
 
 static int pmu_bus_running;
 static struct bus_type pmu_bus = {
 	.name		= "event_source",
-	.dev_attrs	= pmu_dev_attrs,
+	.dev_groups	= pmu_dev_groups,
 };
 
 static void pmu_dev_release(struct device *dev)
@@ -7126,7 +7173,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -7209,7 +7255,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
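
Note on the perf_sample_event_took() change: the limit is now read once, via ACCESS_ONCE(), into a local allowed_ns, so the early return, the threshold comparison and the warning message all observe the same value even if the sysctl handler rewrites perf_sample_allowed_ns concurrently. A minimal userspace sketch of that read-once pattern (a volatile cast stands in for ACCESS_ONCE(); the counters and numbers are made up for illustration):

#include <stdio.h>
#include <stdint.h>

#define READ_ONCE_U64(x)	(*(volatile uint64_t *)&(x))
#define NR_ACCUMULATED_SAMPLES	128

static uint64_t perf_sample_allowed_ns = 10000;	/* rewritten by a sysctl handler */
static uint64_t running_sample_length;		/* per-CPU in the kernel */

static void sample_event_took(uint64_t sample_len_ns)
{
	/* snapshot the tunable once; every use below sees the same value */
	uint64_t allowed_ns = READ_ONCE_U64(perf_sample_allowed_ns);
	uint64_t local_samples_len;

	if (allowed_ns == 0)
		return;

	/* decay the running length by one average sample, then add ours */
	local_samples_len = running_sample_length;
	local_samples_len -= local_samples_len / NR_ACCUMULATED_SAMPLES;
	local_samples_len += sample_len_ns;
	running_sample_length = local_samples_len;

	if (local_samples_len / NR_ACCUMULATED_SAMPLES <= allowed_ns)
		return;

	/* both numbers in the message come from the same snapshot */
	printf("samples too long (%llu > %llu)\n",
	       (unsigned long long)(local_samples_len / NR_ACCUMULATED_SAMPLES),
	       (unsigned long long)allowed_ns);
}

int main(void)
{
	sample_event_took(2000000);	/* a 2ms sample trips the 10us limit */
	return 0;
}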
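
Note on the context_equiv() rewrite: ctx->generation is bumped on every modification (unclone_ctx(), list_add_event(), list_del_event()), and a clone records its parent's generation in parent_gen, so two contexts only count as clones while neither side has changed since the clone was taken. A small userspace sketch of how that check behaves, using a simplified stand-in struct rather than the real perf_event_context:

#include <stdio.h>

struct ctx {
	struct ctx *parent_ctx;		/* context this one was cloned from, if any */
	unsigned long long generation;	/* bumped on every modification */
	unsigned long long parent_gen;	/* parent's generation at clone time */
	int pin_count;
};

static int context_equiv(struct ctx *ctx1, struct ctx *ctx2)
{
	/* Pinning disables the swap optimization */
	if (ctx1->pin_count || ctx2->pin_count)
		return 0;

	/* ctx1 is the parent of ctx2 and has not changed since the clone */
	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
		return 1;

	/* ctx2 is the parent of ctx1 and has not changed since the clone */
	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
		return 1;

	/* both are clones of the same parent, taken at the same generation */
	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
	    ctx1->parent_gen == ctx2->parent_gen)
		return 1;

	return 0;
}

int main(void)
{
	struct ctx parent = { .generation = 4 };
	struct ctx child1 = { .parent_ctx = &parent, .parent_gen = 4 };
	struct ctx child2 = { .parent_ctx = &parent, .parent_gen = 4 };

	printf("clones equivalent: %d\n", context_equiv(&child1, &child2)); /* 1 */

	/* the parent gains an event, then child2 is re-cloned from it */
	parent.generation++;
	child2.parent_gen = parent.generation;
	printf("after divergence:  %d\n", context_equiv(&child1, &child2)); /* 0 */

	return 0;
}

The three accepted cases mirror the diff: parent/child in either direction, and two siblings cloned from the same parent at the same generation. Because list_add_event() now bumps the counter, the explicit ++ctx->generation lines could be dropped from perf_event_open() and perf_event_create_kernel_counter().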
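
Note on the got_name path: the ring buffer is written in u64 units, so the name length (including its terminating NUL) is rounded up to a multiple of 8 and the tail is padded with explicit zero bytes so no stale buffer contents leak to userspace; that explicit padding is what lets the scratch buffer be kmalloc()'d instead of kzalloc()'d. A standalone sketch of the padding, with ordinary char arrays in place of the kernel's PATH_MAX scratch buffer:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* pad the string with NUL bytes until its size is a multiple of sizeof(u64) */
static size_t pad_name(char *name)
{
	size_t size = strlen(name) + 1;

	while (size % sizeof(uint64_t))
		name[size++] = '\0';

	return size;
}

int main(void)
{
	char stack_name[16] = "[stack]";	/* 7 chars + NUL = 8: already aligned */
	char anon_name[16] = "//anon";		/* 6 chars + NUL = 7: padded to 8 */
	char lib_name[24] = "/lib/libfoo.so";	/* 14 chars + NUL = 15: padded to 16 */

	printf("%zu %zu %zu\n",
	       pad_name(stack_name), pad_name(anon_name), pad_name(lib_name));
	return 0;
}

Reserving sizeof(u64) bytes in the d_path() call above is what guarantees the padding loop never runs past the end of the kernel's buffer.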