summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/delayacct.c1
-rw-r--r--kernel/hrtimer.c57
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/perf_counter.c394
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/console.c63
-rw-r--r--kernel/profile.c45
-rw-r--r--kernel/sched_clock.c122
-rw-r--r--kernel/sched_fair.c1
-rw-r--r--kernel/time.c9
-rw-r--r--kernel/time/clocksource.c529
-rw-r--r--kernel/time/jiffies.c6
-rw-r--r--kernel/time/ntp.c7
-rw-r--r--kernel/time/timekeeping.c535
-rw-r--r--kernel/timer.c28
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c23
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c49
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_entries.h17
-rw-r--r--kernel/trace/trace_event_profile.c82
-rw-r--r--kernel/trace/trace_events.c49
-rw-r--r--kernel/trace/trace_power.c218
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_syscalls.c97
30 files changed, 1295 insertions, 2051 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..7c9b0a585502 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
-obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/taskstats.h>
#include <linux/time.h>
#include <linux/sysctl.h>
#include <linux/delayacct.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 05071bf6a37b..c03f221fee44 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
#include <asm/uaccess.h>
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
- struct timespec now;
-
- ktime_get_ts(&now);
-
- return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
- struct timespec now;
-
- getnstimeofday(&now);
-
- return timespec_to_ktime(now);
-}
-
-EXPORT_SYMBOL_GPL(ktime_get_real);
-
/*
* The timer bases:
*
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
}
};
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
- struct timespec tomono;
- unsigned long seq;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- getnstimeofday(ts);
- tomono = wall_to_monotonic;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
/*
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
@@ -1155,7 +1099,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &cpu_base->clock_base[clock_id];
- INIT_LIST_HEAD(&timer->cb_entry);
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
* writer, you don't need extra locking to use these functions.
*/
unsigned int __kfifo_put(struct kfifo *fifo,
- unsigned char *buffer, unsigned int len)
+ const unsigned char *buffer, unsigned int len)
{
unsigned int l;
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * Copyright (C) 2007 Mathieu Desnoyers
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- */
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/jhash.h>
-#include <linux/list.h>
-#include <linux/rcupdate.h>
-#include <linux/marker.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-extern struct marker __start___markers[];
-extern struct marker __stop___markers[];
-
-/* Set to 1 to enable marker debug output */
-static const int marker_debug;
-
-/*
- * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers and the hash table.
- */
-static DEFINE_MUTEX(markers_mutex);
-
-/*
- * Marker hash table, containing the active markers.
- * Protected by module_mutex.
- */
-#define MARKER_HASH_BITS 6
-#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
-/*
- * Note about RCU :
- * It is used to make sure every handler has finished using its private data
- * between two consecutive operation (add or remove) on a given marker. It is
- * also used to delay the free of multiple probes array until a quiescent state
- * is reached.
- * marker entries modifications are protected by the markers_mutex.
- */
-struct marker_entry {
- struct hlist_node hlist;
- char *format;
- /* Probe wrapper */
- void (*call)(const struct marker *mdata, void *call_private, ...);
- struct marker_probe_closure single;
- struct marker_probe_closure *multi;
- int refcount; /* Number of times armed. 0 if disarmed. */
- struct rcu_head rcu;
- void *oldptr;
- int rcu_pending;
- unsigned char ptype:1;
- unsigned char format_allocated:1;
- char name[0]; /* Contains name'\0'format'\0' */
-};
-
-/**
- * __mark_empty_function - Empty probe callback
- * @probe_private: probe private data
- * @call_private: call site private data
- * @fmt: format string
- * @...: variable argument list
- *
- * Empty callback provided as a probe to the markers. By providing this to a
- * disabled marker, we make sure the execution flow is always valid even
- * though the function pointer change and the marker enabling are two distinct
- * operations that modifies the execution flow of preemptible code.
- */
-notrace void __mark_empty_function(void *probe_private, void *call_private,
- const char *fmt, va_list *args)
-{
-}
-EXPORT_SYMBOL_GPL(__mark_empty_function);
-
-/*
- * marker_probe_cb Callback that prepares the variable argument list for probes.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...: Variable argument list.
- *
- * Since we do not use "typical" pointer based RCU in the 1 argument case, we
- * need to put a full smp_rmb() in this branch. This is why we do not use
- * rcu_dereference() for the pointer read.
- */
-notrace void marker_probe_cb(const struct marker *mdata,
- void *call_private, ...)
-{
- va_list args;
- char ptype;
-
- /*
- * rcu_read_lock_sched does two things : disabling preemption to make
- * sure the teardown of the callbacks can be done correctly when they
- * are in modules and they insure RCU read coherency.
- */
- rcu_read_lock_sched_notrace();
- ptype = mdata->ptype;
- if (likely(!ptype)) {
- marker_probe_func *func;
- /* Must read the ptype before ptr. They are not data dependant,
- * so we put an explicit smp_rmb() here. */
- smp_rmb();
- func = mdata->single.func;
- /* Must read the ptr before private data. They are not data
- * dependant, so we put an explicit smp_rmb() here. */
- smp_rmb();
- va_start(args, call_private);
- func(mdata->single.probe_private, call_private, mdata->format,
- &args);
- va_end(args);
- } else {
- struct marker_probe_closure *multi;
- int i;
- /*
- * Read mdata->ptype before mdata->multi.
- */
- smp_rmb();
- multi = mdata->multi;
- /*
- * multi points to an array, therefore accessing the array
- * depends on reading multi. However, even in this case,
- * we must insure that the pointer is read _before_ the array
- * data. Same as rcu_dereference, but we need a full smp_rmb()
- * in the fast path, so put the explicit barrier here.
- */
- smp_read_barrier_depends();
- for (i = 0; multi[i].func; i++) {
- va_start(args, call_private);
- multi[i].func(multi[i].probe_private, call_private,
- mdata->format, &args);
- va_end(args);
- }
- }
- rcu_read_unlock_sched_notrace();
-}
-EXPORT_SYMBOL_GPL(marker_probe_cb);
-
-/*
- * marker_probe_cb Callback that does not prepare the variable argument list.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...: Variable argument list.
- *
- * Should be connected to markers "MARK_NOARGS".
- */
-static notrace void marker_probe_cb_noarg(const struct marker *mdata,
- void *call_private, ...)
-{
- va_list args; /* not initialized */
- char ptype;
-
- rcu_read_lock_sched_notrace();
- ptype = mdata->ptype;
- if (likely(!ptype)) {
- marker_probe_func *func;
- /* Must read the ptype before ptr. They are not data dependant,
- * so we put an explicit smp_rmb() here. */
- smp_rmb();
- func = mdata->single.func;
- /* Must read the ptr before private data. They are not data
- * dependant, so we put an explicit smp_rmb() here. */
- smp_rmb();
- func(mdata->single.probe_private, call_private, mdata->format,
- &args);
- } else {
- struct marker_probe_closure *multi;
- int i;
- /*
- * Read mdata->ptype before mdata->multi.
- */
- smp_rmb();
- multi = mdata->multi;
- /*
- * multi points to an array, therefore accessing the array
- * depends on reading multi. However, even in this case,
- * we must insure that the pointer is read _before_ the array
- * data. Same as rcu_dereference, but we need a full smp_rmb()
- * in the fast path, so put the explicit barrier here.
- */
- smp_read_barrier_depends();
- for (i = 0; multi[i].func; i++)
- multi[i].func(multi[i].probe_private, call_private,
- mdata->format, &args);
- }
- rcu_read_unlock_sched_notrace();
-}
-
-static void free_old_closure(struct rcu_head *head)
-{
- struct marker_entry *entry = container_of(head,
- struct marker_entry, rcu);
- kfree(entry->oldptr);
- /* Make sure we free the data before setting the pending flag to 0 */
- smp_wmb();
- entry->rcu_pending = 0;
-}
-
-static void debug_print_probes(struct marker_entry *entry)
-{
- int i;
-
- if (!marker_debug)
- return;
-
- if (!entry->ptype) {
- printk(KERN_DEBUG "Single probe : %p %p\n",
- entry->single.func,
- entry->single.probe_private);
- } else {
- for (i = 0; entry->multi[i].func; i++)
- printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
- entry->multi[i].func,
- entry->multi[i].probe_private);
- }
-}
-
-static struct marker_probe_closure *
-marker_entry_add_probe(struct marker_entry *entry,
- marker_probe_func *probe, void *probe_private)
-{
- int nr_probes = 0;
- struct marker_probe_closure *old, *new;
-
- WARN_ON(!probe);
-
- debug_print_probes(entry);
- old = entry->multi;
- if (!entry->ptype) {
- if (entry->single.func == probe &&
- entry->single.probe_private == probe_private)
- return ERR_PTR(-EBUSY);
- if (entry->single.func == __mark_empty_function) {
- /* 0 -> 1 probes */
- entry->single.func = probe;
- entry->single.probe_private = probe_private;
- entry->refcount = 1;
- entry->ptype = 0;
- debug_print_probes(entry);
- return NULL;
- } else {
- /* 1 -> 2 probes */
- nr_probes = 1;
- old = NULL;
- }
- } else {
- /* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++)
- if (old[nr_probes].func == probe
- && old[nr_probes].probe_private
- == probe_private)
- return ERR_PTR(-EBUSY);
- }
- /* + 2 : one for new probe, one for NULL func */
- new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
- GFP_KERNEL);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- if (!old)
- new[0] = entry->single;
- else
- memcpy(new, old,
- nr_probes * sizeof(struct marker_probe_closure));
- new[nr_probes].func = probe;
- new[nr_probes].probe_private = probe_private;
- entry->refcount = nr_probes + 1;
- entry->multi = new;
- entry->ptype = 1;
- debug_print_probes(entry);
- return old;
-}
-
-static struct marker_probe_closure *
-marker_entry_remove_probe(struct marker_entry *entry,
- marker_probe_func *probe, void *probe_private)
-{
- int nr_probes = 0, nr_del = 0, i;
- struct marker_probe_closure *old, *new;
-
- old = entry->multi;
-
- debug_print_probes(entry);
- if (!entry->ptype) {
- /* 0 -> N is an error */
- WARN_ON(entry->single.func == __mark_empty_function);
- /* 1 -> 0 probes */
- WARN_ON(probe && entry->single.func != probe);
- WARN_ON(entry->single.probe_private != probe_private);
- entry->single.func = __mark_empty_function;
- entry->refcount = 0;
- entry->ptype = 0;
- debug_print_probes(entry);
- return NULL;
- } else {
- /* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if ((!probe || old[nr_probes].func == probe)
- && old[nr_probes].probe_private
- == probe_private)
- nr_del++;
- }
- }
-
- if (nr_probes - nr_del == 0) {
- /* N -> 0, (N > 1) */
- entry->single.func = __mark_empty_function;
- entry->refcount = 0;
- entry->ptype = 0;
- } else if (nr_probes - nr_del == 1) {
- /* N -> 1, (N > 1) */
- for (i = 0; old[i].func; i++)
- if ((probe && old[i].func != probe) ||
- old[i].probe_private != probe_private)
- entry->single = old[i];
- entry->refcount = 1;
- entry->ptype = 0;
- } else {
- int j = 0;
- /* N -> M, (N > 1, M > 1) */
- /* + 1 for NULL */
- new = kzalloc((nr_probes - nr_del + 1)
- * sizeof(struct marker_probe_closure), GFP_KERNEL);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- for (i = 0; old[i].func; i++)
- if ((probe && old[i].func != probe) ||
- old[i].probe_private != probe_private)
- new[j++] = old[i];
- entry->refcount = nr_probes - nr_del;
- entry->ptype = 1;
- entry->multi = new;
- }
- debug_print_probes(entry);
- return old;
-}
-
-/*
- * Get marker if the marker is present in the marker hash table.
- * Must be called with markers_mutex held.
- * Returns NULL if not present.
- */
-static struct marker_entry *get_marker(const char *name)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct marker_entry *e;
- u32 hash = jhash(name, strlen(name), 0);
-
- head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name))
- return e;
- }
- return NULL;
-}
-
-/*
- * Add the marker to the marker hash table. Must be called with markers_mutex
- * held.
- */
-static struct marker_entry *add_marker(const char *name, const char *format)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct marker_entry *e;
- size_t name_len = strlen(name) + 1;
- size_t format_len = 0;
- u32 hash = jhash(name, name_len-1, 0);
-
- if (format)
- format_len = strlen(format) + 1;
- head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name)) {
- printk(KERN_NOTICE
- "Marker %s busy\n", name);
- return ERR_PTR(-EBUSY); /* Already there */
- }
- }
- /*
- * Using kmalloc here to allocate a variable length element. Could
- * cause some memory fragmentation if overused.
- */
- e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
- GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
- memcpy(&e->name[0], name, name_len);
- if (format) {
- e->format = &e->name[name_len];
- memcpy(e->format, format, format_len);
- if (strcmp(e->format, MARK_NOARGS) == 0)
- e->call = marker_probe_cb_noarg;
- else
- e->call = marker_probe_cb;
- trace_mark(core_marker_format, "name %s format %s",
- e->name, e->format);
- } else {
- e->format = NULL;
- e->call = marker_probe_cb;
- }
- e->single.func = __mark_empty_function;
- e->single.probe_private = NULL;
- e->multi = NULL;
- e->ptype = 0;
- e->format_allocated = 0;
- e->refcount = 0;
- e->rcu_pending = 0;
- hlist_add_head(&e->hlist, head);
- return e;
-}
-
-/*
- * Remove the marker from the marker hash table. Must be called with mutex_lock
- * held.
- */
-static int remove_marker(const char *name)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct marker_entry *e;
- int found = 0;
- size_t len = strlen(name) + 1;
- u32 hash = jhash(name, len-1, 0);
-
- head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name)) {
- found = 1;
- break;
- }
- }
- if (!found)
- return -ENOENT;
- if (e->single.func != __mark_empty_function)
- return -EBUSY;
- hlist_del(&e->hlist);
- if (e->format_allocated)
- kfree(e->format);
- /* Make sure the call_rcu has been executed */
- if (e->rcu_pending)
- rcu_barrier_sched();
- kfree(e);
- return 0;
-}
-
-/*
- * Set the mark_entry format to the format found in the element.
- */
-static int marker_set_format(struct marker_entry *entry, const char *format)
-{
- entry->format = kstrdup(format, GFP_KERNEL);
- if (!entry->format)
- return -ENOMEM;
- entry->format_allocated = 1;
-
- trace_mark(core_marker_format, "name %s format %s",
- entry->name, entry->format);
- return 0;
-}
-
-/*
- * Sets the probe callback corresponding to one marker.
- */
-static int set_marker(struct marker_entry *entry, struct marker *elem,
- int active)
-{
- int ret = 0;
- WARN_ON(strcmp(entry->name, elem->name) != 0);
-
- if (entry->format) {
- if (strcmp(entry->format, elem->format) != 0) {
- printk(KERN_NOTICE
- "Format mismatch for probe %s "
- "(%s), marker (%s)\n",
- entry->name,
- entry->format,
- elem->format);
- return -EPERM;
- }
- } else {
- ret = marker_set_format(entry, elem->format);
- if (ret)
- return ret;
- }
-
- /*
- * probe_cb setup (statically known) is done here. It is
- * asynchronous with the rest of execution, therefore we only
- * pass from a "safe" callback (with argument) to an "unsafe"
- * callback (does not set arguments).
- */
- elem->call = entry->call;
- /*
- * Sanity check :
- * We only update the single probe private data when the ptr is
- * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
- */
- WARN_ON(elem->single.func != __mark_empty_function
- && elem->single.probe_private != entry->single.probe_private
- && !elem->ptype);
- elem->single.probe_private = entry->single.probe_private;
- /*
- * Make sure the private data is valid when we update the
- * single probe ptr.
- */
- smp_wmb();
- elem->single.func = entry->single.func;
- /*
- * We also make sure that the new probe callbacks array is consistent
- * before setting a pointer to it.
- */
- rcu_assign_pointer(elem->multi, entry->multi);
- /*
- * Update the function or multi probe array pointer before setting the
- * ptype.
- */
- smp_wmb();
- elem->ptype = entry->ptype;
-
- if (elem->tp_name && (active ^ elem->state)) {
- WARN_ON(!elem->tp_cb);
- /*
- * It is ok to directly call the probe registration because type
- * checking has been done in the __trace_mark_tp() macro.
- */
-
- if (active) {
- /*
- * try_module_get should always succeed because we hold
- * lock_module() to get the tp_cb address.
- */
- ret = try_module_get(__module_text_address(
- (unsigned long)elem->tp_cb));
- BUG_ON(!ret);
- ret = tracepoint_probe_register_noupdate(
- elem->tp_name,
- elem->tp_cb);
- } else {
- ret = tracepoint_probe_unregister_noupdate(
- elem->tp_name,
- elem->tp_cb);
- /*
- * tracepoint_probe_update_all() must be called
- * before the module containing tp_cb is unloaded.
- */
- module_put(__module_text_address(
- (unsigned long)elem->tp_cb));
- }
- }
- elem->state = active;
-
- return ret;
-}
-
-/*
- * Disable a marker and its probe callback.
- * Note: only waiting an RCU period after setting elem->call to the empty
- * function insures that the original callback is not used anymore. This insured
- * by rcu_read_lock_sched around the call site.
- */
-static void disable_marker(struct marker *elem)
-{
- int ret;
-
- /* leave "call" as is. It is known statically. */
- if (elem->tp_name && elem->state) {
- WARN_ON(!elem->tp_cb);
- /*
- * It is ok to directly call the probe registration because type
- * checking has been done in the __trace_mark_tp() macro.
- */
- ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
- elem->tp_cb);
- WARN_ON(ret);
- /*
- * tracepoint_probe_update_all() must be called
- * before the module containing tp_cb is unloaded.
- */
- module_put(__module_text_address((unsigned long)elem->tp_cb));
- }
- elem->state = 0;
- elem->single.func = __mark_empty_function;
- /* Update the function before setting the ptype */
- smp_wmb();
- elem->ptype = 0; /* single probe */
- /*
- * Leave the private data and id there, because removal is racy and
- * should be done only after an RCU period. These are never used until
- * the next initialization anyway.
- */
-}
-
-/**
- * marker_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of markers.
- */
-void marker_update_probe_range(struct marker *begin,
- struct marker *end)
-{
- struct marker *iter;
- struct marker_entry *mark_entry;
-
- mutex_lock(&markers_mutex);
- for (iter = begin; iter < end; iter++) {
- mark_entry = get_marker(iter->name);
- if (mark_entry) {
- set_marker(mark_entry, iter, !!mark_entry->refcount);
- /*
- * ignore error, continue
- */
- } else {
- disable_marker(iter);
- }
- }
- mutex_unlock(&markers_mutex);
-}
-
-/*
- * Update probes, removing the faulty probes.
- *
- * Internal callback only changed before the first probe is connected to it.
- * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
- * transitions. All other transitions will leave the old private data valid.
- * This makes the non-atomicity of the callback/private data updates valid.
- *
- * "special case" updates :
- * 0 -> 1 callback
- * 1 -> 0 callback
- * 1 -> 2 callbacks
- * 2 -> 1 callbacks
- * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
- * Site effect : marker_set_format may delete the marker entry (creating a
- * replacement).
- */
-static void marker_update_probes(void)
-{
- /* Core kernel markers */
- marker_update_probe_range(__start___markers, __stop___markers);
- /* Markers in modules. */
- module_update_markers();
- tracepoint_probe_update_all();
-}
-
-/**
- * marker_probe_register - Connect a probe to a marker
- * @name: marker name
- * @format: format string
- * @probe: probe handler
- * @probe_private: probe private data
- *
- * private data must be a valid allocated memory address, or NULL.
- * Returns 0 if ok, error value on error.
- * The probe address must at least be aligned on the architecture pointer size.
- */
-int marker_probe_register(const char *name, const char *format,
- marker_probe_func *probe, void *probe_private)
-{
- struct marker_entry *entry;
- int ret = 0;
- struct marker_probe_closure *old;
-
- mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry) {
- entry = add_marker(name, format);
- if (IS_ERR(entry))
- ret = PTR_ERR(entry);
- } else if (format) {
- if (!entry->format)
- ret = marker_set_format(entry, format);
- else if (strcmp(entry->format, format))
- ret = -EPERM;
- }
- if (ret)
- goto end;
-
- /*
- * If we detect that a call_rcu is pending for this marker,
- * make sure it's executed now.
- */
- if (entry->rcu_pending)
- rcu_barrier_sched();
- old = marker_entry_add_probe(entry, probe, probe_private);
- if (IS_ERR(old)) {
- ret = PTR_ERR(old);
- goto end;
- }
- mutex_unlock(&markers_mutex);
- marker_update_probes();
- mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- entry->oldptr = old;
- entry->rcu_pending = 1;
- /* write rcu_pending before calling the RCU callback */
- smp_wmb();
- call_rcu_sched(&entry->rcu, free_old_closure);
-end:
- mutex_unlock(&markers_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_register);
-
-/**
- * marker_probe_unregister - Disconnect a probe from a marker
- * @name: marker name
- * @probe: probe function pointer
- * @probe_private: probe private data
- *
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister(const char *name,
- marker_probe_func *probe, void *probe_private)
-{
- struct marker_entry *entry;
- struct marker_probe_closure *old;
- int ret = -ENOENT;
-
- mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- old = marker_entry_remove_probe(entry, probe, probe_private);
- mutex_unlock(&markers_mutex);
- marker_update_probes();
- mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- entry->oldptr = old;
- entry->rcu_pending = 1;
- /* write rcu_pending before calling the RCU callback */
- smp_wmb();
- call_rcu_sched(&entry->rcu, free_old_closure);
- remove_marker(name); /* Ignore busy error message */
- ret = 0;
-end:
- mutex_unlock(&markers_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister);
-
-static struct marker_entry *
-get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
-{
- struct marker_entry *entry;
- unsigned int i;
- struct hlist_head *head;
- struct hlist_node *node;
-
- for (i = 0; i < MARKER_TABLE_SIZE; i++) {
- head = &marker_table[i];
- hlist_for_each_entry(entry, node, head, hlist) {
- if (!entry->ptype) {
- if (entry->single.func == probe
- && entry->single.probe_private
- == probe_private)
- return entry;
- } else {
- struct marker_probe_closure *closure;
- closure = entry->multi;
- for (i = 0; closure[i].func; i++) {
- if (closure[i].func == probe &&
- closure[i].probe_private
- == probe_private)
- return entry;
- }
- }
- }
- }
- return NULL;
-}
-
-/**
- * marker_probe_unregister_private_data - Disconnect a probe from a marker
- * @probe: probe function
- * @probe_private: probe private data
- *
- * Unregister a probe by providing the registered private data.
- * Only removes the first marker found in hash table.
- * Return 0 on success or error value.
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister_private_data(marker_probe_func *probe,
- void *probe_private)
-{
- struct marker_entry *entry;
- int ret = 0;
- struct marker_probe_closure *old;
-
- mutex_lock(&markers_mutex);
- entry = get_marker_from_private_data(probe, probe_private);
- if (!entry) {
- ret = -ENOENT;
- goto end;
- }
- if (entry->rcu_pending)
- rcu_barrier_sched();
- old = marker_entry_remove_probe(entry, NULL, probe_private);
- mutex_unlock(&markers_mutex);
- marker_update_probes();
- mutex_lock(&markers_mutex);
- entry = get_marker_from_private_data(probe, probe_private);
- if (!entry)
- goto end;
- if (entry->rcu_pending)
- rcu_barrier_sched();
- entry->oldptr = old;
- entry->rcu_pending = 1;
- /* write rcu_pending before calling the RCU callback */
- smp_wmb();
- call_rcu_sched(&entry->rcu, free_old_closure);
- remove_marker(entry->name); /* Ignore busy error message */
-end:
- mutex_unlock(&markers_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
-
-/**
- * marker_get_private_data - Get a marker's probe private data
- * @name: marker name
- * @probe: probe to match
- * @num: get the nth matching probe's private data
- *
- * Returns the nth private data pointer (starting from 0) matching, or an
- * ERR_PTR.
- * Returns the private data pointer, or an ERR_PTR.
- * The private data pointer should _only_ be dereferenced if the caller is the
- * owner of the data, or its content could vanish. This is mostly used to
- * confirm that a caller is the owner of a registered probe.
- */
-void *marker_get_private_data(const char *name, marker_probe_func *probe,
- int num)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct marker_entry *e;
- size_t name_len = strlen(name) + 1;
- u32 hash = jhash(name, name_len-1, 0);
- int i;
-
- head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (!strcmp(name, e->name)) {
- if (!e->ptype) {
- if (num == 0 && e->single.func == probe)
- return e->single.probe_private;
- } else {
- struct marker_probe_closure *closure;
- int match = 0;
- closure = e->multi;
- for (i = 0; closure[i].func; i++) {
- if (closure[i].func != probe)
- continue;
- if (match++ == num)
- return closure[i].probe_private;
- }
- }
- break;
- }
- }
- return ERR_PTR(-ENOENT);
-}
-EXPORT_SYMBOL_GPL(marker_get_private_data);
-
-#ifdef CONFIG_MODULES
-
-int marker_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct module *mod = data;
-
- switch (val) {
- case MODULE_STATE_COMING:
- marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers);
- break;
- case MODULE_STATE_GOING:
- marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers);
- break;
- }
- return 0;
-}
-
-struct notifier_block marker_module_nb = {
- .notifier_call = marker_module_notify,
- .priority = 0,
-};
-
-static int init_markers(void)
-{
- return register_module_notifier(&marker_module_nb);
-}
-__initcall(init_markers);
-
-#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..b6ee424245dd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2237,10 +2237,6 @@ static noinline struct module *load_module(void __user *umod,
sizeof(*mod->ctors), &mod->num_ctors);
#endif
-#ifdef CONFIG_MARKERS
- mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
- sizeof(*mod->markers), &mod->num_markers);
-#endif
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
"__tracepoints",
@@ -2958,20 +2954,6 @@ void module_layout(struct module *mod,
EXPORT_SYMBOL(module_layout);
#endif
-#ifdef CONFIG_MARKERS
-void module_update_markers(void)
-{
- struct module *mod;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(mod, &modules, list)
- if (!mod->taints)
- marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers);
- mutex_unlock(&module_mutex);
-}
-#endif
-
#ifdef CONFIG_TRACEPOINTS
void module_update_tracepoints(void)
{
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8cb94a52d1bb..cc768ab81ac8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
data->nr_pages = nr_pages;
atomic_set(&data->lock, -1);
+ if (counter->attr.watermark) {
+ data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+ counter->attr.wakeup_watermark);
+ }
+ if (!data->watermark)
+ data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
rcu_assign_pointer(counter->data, data);
return 0;
@@ -2315,7 +2322,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
lock_limit >>= PAGE_SHIFT;
locked = vma->vm_mm->locked_vm + extra;
- if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}
@@ -2504,35 +2512,15 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
/*
* Output
*/
-
-struct perf_output_handle {
- struct perf_counter *counter;
- struct perf_mmap_data *data;
- unsigned long head;
- unsigned long offset;
- int nmi;
- int sample;
- int locked;
- unsigned long flags;
-};
-
-static bool perf_output_space(struct perf_mmap_data *data,
- unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+ unsigned long offset, unsigned long head)
{
- unsigned long tail;
unsigned long mask;
if (!data->writable)
return true;
mask = (data->nr_pages << PAGE_SHIFT) - 1;
- /*
- * Userspace could choose to issue a mb() before updating the tail
- * pointer. So that all reads will be completed before the write is
- * issued.
- */
- tail = ACCESS_ONCE(data->user_page->data_tail);
- smp_rmb();
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -2633,8 +2621,8 @@ out:
local_irq_restore(handle->flags);
}
-static void perf_output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
{
unsigned int pages_mask;
unsigned int offset;
@@ -2669,16 +2657,13 @@ static void perf_output_copy(struct perf_output_handle *handle,
WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
-#define perf_output_put(handle, x) \
- perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
- struct perf_counter *counter, unsigned int size,
- int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_counter *counter, unsigned int size,
+ int nmi, int sample)
{
struct perf_counter *output_counter;
struct perf_mmap_data *data;
- unsigned int offset, head;
+ unsigned long tail, offset, head;
int have_lost;
struct {
struct perf_event_header header;
@@ -2716,16 +2701,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
perf_output_lock(handle);
do {
+ /*
+ * Userspace could choose to issue a mb() before updating the
+ * tail pointer. So that all reads will be completed before the
+ * write is issued.
+ */
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+ smp_rmb();
offset = head = atomic_long_read(&data->head);
head += size;
- if (unlikely(!perf_output_space(data, offset, head)))
+ if (unlikely(!perf_output_space(data, tail, offset, head)))
goto fail;
} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
handle->offset = offset;
handle->head = head;
- if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+ if (head - tail > data->watermark)
atomic_set(&data->wakeup, 1);
if (have_lost) {
@@ -2749,7 +2741,7 @@ out:
return -ENOSPC;
}
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
{
struct perf_counter *counter = handle->counter;
struct perf_mmap_data *data = handle->data;
@@ -2863,156 +2855,176 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, counter);
}
-void perf_counter_output(struct perf_counter *counter, int nmi,
- struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+ struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_counter *counter)
{
- int ret;
- u64 sample_type = counter->attr.sample_type;
- struct perf_output_handle handle;
- struct perf_event_header header;
- u64 ip;
- struct {
- u32 pid, tid;
- } tid_entry;
- struct perf_callchain_entry *callchain = NULL;
- int callchain_size = 0;
- u64 time;
- struct {
- u32 cpu, reserved;
- } cpu_entry;
+ u64 sample_type = data->type;
- header.type = PERF_EVENT_SAMPLE;
- header.size = sizeof(header);
+ perf_output_put(handle, *header);
- header.misc = 0;
- header.misc |= perf_misc_flags(data->regs);
-
- if (sample_type & PERF_SAMPLE_IP) {
- ip = perf_instruction_pointer(data->regs);
- header.size += sizeof(ip);
- }
-
- if (sample_type & PERF_SAMPLE_TID) {
- /* namespace issues */
- tid_entry.pid = perf_counter_pid(counter, current);
- tid_entry.tid = perf_counter_tid(counter, current);
-
- header.size += sizeof(tid_entry);
- }
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(handle, data->ip);
- if (sample_type & PERF_SAMPLE_TIME) {
- /*
- * Maybe do better on x86 and provide cpu_clock_nmi()
- */
- time = sched_clock();
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
- header.size += sizeof(u64);
- }
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
if (sample_type & PERF_SAMPLE_ADDR)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->addr);
if (sample_type & PERF_SAMPLE_ID)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
- header.size += sizeof(u64);
-
- if (sample_type & PERF_SAMPLE_CPU) {
- header.size += sizeof(cpu_entry);
+ perf_output_put(handle, data->stream_id);
- cpu_entry.cpu = raw_smp_processor_id();
- cpu_entry.reserved = 0;
- }
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
if (sample_type & PERF_SAMPLE_PERIOD)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->period);
if (sample_type & PERF_SAMPLE_READ)
- header.size += perf_counter_read_size(counter);
+ perf_output_read(handle, counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- callchain = perf_callchain(data->regs);
+ if (data->callchain) {
+ int size = 1;
- if (callchain) {
- callchain_size = (1 + callchain->nr) * sizeof(u64);
- header.size += callchain_size;
- } else
- header.size += sizeof(u64);
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ size *= sizeof(u64);
+
+ perf_output_copy(handle, data->callchain, size);
+ } else {
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
+ if (data->raw) {
+ perf_output_put(handle, data->raw->size);
+ perf_output_copy(handle, data->raw->data,
+ data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(handle, raw);
+ }
+ }
+}
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+void perf_prepare_sample(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_counter *counter,
+ struct pt_regs *regs)
+{
+ u64 sample_type = counter->attr.sample_type;
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header.size += size;
- }
+ data->type = sample_type;
- ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
- if (ret)
- return;
+ header->type = PERF_EVENT_SAMPLE;
+ header->size = sizeof(*header);
- perf_output_put(&handle, header);
+ header->misc = 0;
+ header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP)
- perf_output_put(&handle, ip);
+ if (sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(regs);
- if (sample_type & PERF_SAMPLE_TID)
- perf_output_put(&handle, tid_entry);
+ header->size += sizeof(data->ip);
+ }
- if (sample_type & PERF_SAMPLE_TIME)
- perf_output_put(&handle, time);
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_counter_pid(counter, current);
+ data->tid_entry.tid = perf_counter_tid(counter, current);
+
+ header->size += sizeof(data->tid_entry);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME) {
+ data->time = perf_clock();
+
+ header->size += sizeof(data->time);
+ }
if (sample_type & PERF_SAMPLE_ADDR)
- perf_output_put(&handle, data->addr);
+ header->size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_ID) {
- u64 id = primary_counter_id(counter);
+ data->id = primary_counter_id(counter);
- perf_output_put(&handle, id);
+ header->size += sizeof(data->id);
}
- if (sample_type & PERF_SAMPLE_STREAM_ID)
- perf_output_put(&handle, counter->id);
+ if (sample_type & PERF_SAMPLE_STREAM_ID) {
+ data->stream_id = counter->id;
- if (sample_type & PERF_SAMPLE_CPU)
- perf_output_put(&handle, cpu_entry);
+ header->size += sizeof(data->stream_id);
+ }
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+
+ header->size += sizeof(data->cpu_entry);
+ }
if (sample_type & PERF_SAMPLE_PERIOD)
- perf_output_put(&handle, data->period);
+ header->size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_READ)
- perf_output_read(&handle, counter);
+ header->size += perf_counter_read_size(counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- if (callchain)
- perf_output_copy(&handle, callchain, callchain_size);
- else {
- u64 nr = 0;
- perf_output_put(&handle, nr);
- }
+ int size = 1;
+
+ data->callchain = perf_callchain(regs);
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ header->size += size * sizeof(u64);
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- perf_output_put(&handle, data->raw->size);
- perf_output_copy(&handle, data->raw->data, data->raw->size);
- } else {
- struct {
- u32 size;
- u32 data;
- } raw = {
- .size = sizeof(u32),
- .data = 0,
- };
- perf_output_put(&handle, raw);
- }
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header->size += size;
}
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+
+ perf_prepare_sample(&header, data, counter, regs);
+
+ if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+ return;
+
+ perf_output_sample(&handle, &header, data, counter);
perf_output_end(&handle);
}
@@ -3071,6 +3083,7 @@ struct perf_task_event {
u32 ppid;
u32 tid;
u32 ptid;
+ u64 time;
} event;
};
@@ -3078,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
struct perf_task_event *task_event)
{
struct perf_output_handle handle;
- int size = task_event->event.header.size;
+ int size;
struct task_struct *task = task_event->task;
- int ret = perf_output_begin(&handle, counter, size, 0, 0);
+ int ret;
+
+ size = task_event->event.header.size;
+ ret = perf_output_begin(&handle, counter, size, 0, 0);
if (ret)
return;
@@ -3091,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
task_event->event.tid = perf_counter_tid(counter, task);
task_event->event.ptid = perf_counter_tid(counter, current);
+ task_event->event.time = perf_clock();
+
perf_output_put(&handle, task_event->event);
+
perf_output_end(&handle);
}
@@ -3473,7 +3492,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = sched_clock(),
+ .time = perf_clock(),
.id = primary_counter_id(counter),
.stream_id = counter->id,
};
@@ -3493,14 +3512,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
* Generic counter overflow handling, sampling.
*/
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
- struct perf_sample_data *data)
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&counter->event_limit);
- int throttle = counter->pmu->unthrottle != NULL;
struct hw_perf_counter *hwc = &counter->hw;
int ret = 0;
+ throttle = (throttle && counter->pmu->unthrottle != NULL);
+
if (!throttle) {
hwc->interrupts++;
} else {
@@ -3523,7 +3544,7 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
}
if (counter->attr.freq) {
- u64 now = sched_clock();
+ u64 now = perf_clock();
s64 delta = now - hwc->freq_stamp;
hwc->freq_stamp = now;
@@ -3549,10 +3570,17 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
perf_counter_disable(counter);
}
- perf_counter_output(counter, nmi, data);
+ perf_counter_output(counter, nmi, data, regs);
return ret;
}
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return __perf_counter_overflow(counter, nmi, 1, data, regs);
+}
+
/*
* Generic software counter infrastructure
*/
@@ -3588,9 +3616,11 @@ again:
}
static void perf_swcounter_overflow(struct perf_counter *counter,
- int nmi, struct perf_sample_data *data)
+ int nmi, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct hw_perf_counter *hwc = &counter->hw;
+ int throttle = 0;
u64 overflow;
data->period = counter->hw.last_period;
@@ -3600,13 +3630,15 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
return;
for (; overflow; overflow--) {
- if (perf_counter_overflow(counter, nmi, data)) {
+ if (__perf_counter_overflow(counter, nmi, throttle,
+ data, regs)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
+ throttle = 1;
}
}
@@ -3618,7 +3650,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
}
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
- int nmi, struct perf_sample_data *data)
+ int nmi, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct hw_perf_counter *hwc = &counter->hw;
@@ -3627,11 +3660,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
if (!hwc->sample_period)
return;
- if (!data->regs)
+ if (!regs)
return;
if (!atomic64_add_negative(nr, &hwc->period_left))
- perf_swcounter_overflow(counter, nmi, data);
+ perf_swcounter_overflow(counter, nmi, data, regs);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3690,7 +3723,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
- struct perf_sample_data *data)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_counter *counter;
@@ -3699,8 +3733,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
rcu_read_lock();
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
- if (perf_swcounter_match(counter, type, event, data->regs))
- perf_swcounter_add(counter, nr, nmi, data);
+ if (perf_swcounter_match(counter, type, event, regs))
+ perf_swcounter_add(counter, nr, nmi, data, regs);
}
rcu_read_unlock();
}
@@ -3721,7 +3755,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
u64 nr, int nmi,
- struct perf_sample_data *data)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3734,7 +3769,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
barrier();
perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
- nr, nmi, data);
+ nr, nmi, data, regs);
rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
@@ -3742,7 +3777,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
*/
ctx = rcu_dereference(current->perf_counter_ctxp);
if (ctx)
- perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+ perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
rcu_read_unlock();
barrier();
@@ -3756,11 +3791,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data = {
- .regs = regs,
.addr = addr,
};
- do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+ do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+ &data, regs);
}
static void perf_swcounter_read(struct perf_counter *counter)
@@ -3797,6 +3832,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
+ struct pt_regs *regs;
struct perf_counter *counter;
u64 period;
@@ -3804,17 +3840,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
counter->pmu->read(counter);
data.addr = 0;
- data.regs = get_irq_regs();
+ regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
- if ((counter->attr.exclude_kernel || !data.regs) &&
+ if ((counter->attr.exclude_kernel || !regs) &&
!counter->attr.exclude_user)
- data.regs = task_pt_regs(current);
+ regs = task_pt_regs(current);
- if (data.regs) {
- if (perf_counter_overflow(counter, 0, &data))
+ if (regs) {
+ if (perf_counter_overflow(counter, 0, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -3950,15 +3986,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
};
struct perf_sample_data data = {
- .regs = get_irq_regs(),
.addr = addr,
.raw = &raw,
};
- if (!data.regs)
- data.regs = task_pt_regs(current);
+ struct pt_regs *regs = get_irq_regs();
+
+ if (!regs)
+ regs = task_pt_regs(current);
- do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+ &data, regs);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
@@ -4170,8 +4208,8 @@ done:
static int perf_copy_attr(struct perf_counter_attr __user *uattr,
struct perf_counter_attr *attr)
{
- int ret;
u32 size;
+ int ret;
if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
return -EFAULT;
@@ -4196,19 +4234,19 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
/*
* If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0.
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
*/
if (size > sizeof(*attr)) {
- unsigned long val;
- unsigned long __user *addr;
- unsigned long __user *end;
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
- addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
- sizeof(unsigned long));
- end = PTR_ALIGN((void __user *)uattr + size,
- sizeof(unsigned long));
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
- for (; addr < end; addr += sizeof(unsigned long)) {
+ for (; addr < end; addr++) {
ret = get_user(val, addr);
if (ret)
return ret;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
return 0;
}
+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+ *tp = current_kernel_time();
+ return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+ struct timespec *tp)
+{
+ *tp = get_monotonic_coarse();
+ return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+ *tp = ktime_to_timespec(KTIME_LOW_RES);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
.timer_create = no_timer_create,
.nsleep = no_nsleep,
};
+ struct k_clock clock_realtime_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ .nsleep = no_nsleep,
+ };
+ struct k_clock clock_monotonic_coarse = {
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ .clock_set = do_posix_clock_nosettime,
+ .timer_create = no_timer_create,
+ .nsleep = no_nsleep,
+ };
register_posix_clock(CLOCK_REALTIME, &clock_realtime);
register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
-static int disable_vt_switch;
-
-/*
- * Normally during a suspend, we allocate a new console and switch to it.
- * When we resume, we switch back to the original console. This switch
- * can be slow, so on systems where the framebuffer can handle restoration
- * of video registers anyways, there's little point in doing the console
- * switch. This function allows you to disable it by passing it '0'.
- */
-void pm_set_vt_switch(int do_switch)
-{
- acquire_console_sem();
- disable_vt_switch = !do_switch;
- release_console_sem();
-}
-EXPORT_SYMBOL(pm_set_vt_switch);
int pm_prepare_console(void)
{
- acquire_console_sem();
-
- if (disable_vt_switch) {
- release_console_sem();
- return 0;
- }
-
- orig_fgconsole = fg_console;
-
- if (vc_allocate(SUSPEND_CONSOLE)) {
- /* we can't have a free VC for now. Too bad,
- * we don't want to mess the screen for now. */
- release_console_sem();
+ orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
+ if (orig_fgconsole < 0)
return 1;
- }
- if (set_console(SUSPEND_CONSOLE)) {
- /*
- * We're unable to switch to the SUSPEND_CONSOLE.
- * Let the calling function know so it can decide
- * what to do.
- */
- release_console_sem();
- return 1;
- }
- release_console_sem();
-
- if (vt_waitactive(SUSPEND_CONSOLE)) {
- pr_debug("Suspend: Can't switch VCs.");
- return 1;
- }
orig_kmsg = kmsg_redirect;
kmsg_redirect = SUSPEND_CONSOLE;
return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
- acquire_console_sem();
- if (disable_vt_switch) {
- release_console_sem();
- return;
- }
- set_console(orig_fgconsole);
- release_console_sem();
-
- if (vt_waitactive(orig_fgconsole)) {
- pr_debug("Resume: Can't switch VCs.");
- return;
+ if (orig_fgconsole >= 0) {
+ vt_move_to_console(orig_fgconsole, 0);
+ kmsg_redirect = orig_kmsg;
}
-
- kmsg_redirect = orig_kmsg;
}
#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/uaccess.h>
-static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
- int len = cpumask_scnprintf(page, count, data);
- if (count - len < 2)
- return -EINVAL;
- len += sprintf(page + len, "\n");
- return len;
+ seq_cpumask(m, prof_cpu_mask);
+ seq_putc(m, '\n');
+ return 0;
}
-static int prof_cpu_mask_write_proc(struct file *file,
- const char __user *buffer, unsigned long count, void *data)
+static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, prof_cpu_mask_proc_show, NULL);
+}
+
+static ssize_t prof_cpu_mask_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
{
- struct cpumask *mask = data;
- unsigned long full_count = count, err;
cpumask_var_t new_value;
+ int err;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
err = cpumask_parse_user(buffer, count, new_value);
if (!err) {
- cpumask_copy(mask, new_value);
- err = full_count;
+ cpumask_copy(prof_cpu_mask, new_value);
+ err = count;
}
free_cpumask_var(new_value);
return err;
}
+static const struct file_operations prof_cpu_mask_proc_fops = {
+ .open = prof_cpu_mask_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = prof_cpu_mask_proc_write,
+};
+
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
- struct proc_dir_entry *entry;
-
/* create /proc/irq/prof_cpu_mask */
- entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
- if (!entry)
- return;
- entry->data = prof_cpu_mask;
- entry->read_proc = prof_cpu_mask_read_proc;
- entry->write_proc = prof_cpu_mask_write_proc;
+ proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
}
/*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
__read_mostly int sched_clock_stable;
struct sched_clock_data {
- /*
- * Raw spinlock - this is a special case: this might be called
- * from within instrumentation code so we dont want to do any
- * instrumentation ourselves.
- */
- raw_spinlock_t lock;
-
u64 tick_raw;
u64 tick_gtod;
u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
for_each_possible_cpu(cpu) {
struct sched_clock_data *scd = cpu_sdc(cpu);
- scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 sched_clock_local(struct sched_clock_data *scd)
{
- s64 delta = now - scd->tick_raw;
- u64 clock, min_clock, max_clock;
+ u64 now, clock, old_clock, min_clock, max_clock;
+ s64 delta;
+again:
+ now = sched_clock();
+ delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
+ old_clock = scd->clock;
+
/*
* scd->clock = clamp(scd->tick_gtod + delta,
* max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
*/
clock = scd->tick_gtod + delta;
- min_clock = wrap_max(scd->tick_gtod, scd->clock);
- max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+ min_clock = wrap_max(scd->tick_gtod, old_clock);
+ max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- scd->clock = clock;
+ if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
+ goto again;
- return scd->clock;
+ return clock;
}
-static void lock_double_clock(struct sched_clock_data *data1,
- struct sched_clock_data *data2)
+static u64 sched_clock_remote(struct sched_clock_data *scd)
{
- if (data1 < data2) {
- __raw_spin_lock(&data1->lock);
- __raw_spin_lock(&data2->lock);
+ struct sched_clock_data *my_scd = this_scd();
+ u64 this_clock, remote_clock;
+ u64 *ptr, old_val, val;
+
+ sched_clock_local(my_scd);
+again:
+ this_clock = my_scd->clock;
+ remote_clock = scd->clock;
+
+ /*
+ * Use the opportunity that we have both locks
+ * taken to couple the two clocks: we take the
+ * larger time as the latest time for both
+ * runqueues. (this creates monotonic movement)
+ */
+ if (likely((s64)(remote_clock - this_clock) < 0)) {
+ ptr = &scd->clock;
+ old_val = remote_clock;
+ val = this_clock;
} else {
- __raw_spin_lock(&data2->lock);
- __raw_spin_lock(&data1->lock);
+ /*
+ * Should be rare, but possible:
+ */
+ ptr = &my_scd->clock;
+ old_val = this_clock;
+ val = remote_clock;
}
+
+ if (cmpxchg(ptr, old_val, val) != old_val)
+ goto again;
+
+ return val;
}
u64 sched_clock_cpu(int cpu)
{
- u64 now, clock, this_clock, remote_clock;
struct sched_clock_data *scd;
+ u64 clock;
+
+ WARN_ON_ONCE(!irqs_disabled());
if (sched_clock_stable)
return sched_clock();
- scd = cpu_sdc(cpu);
-
- /*
- * Normally this is not called in NMI context - but if it is,
- * trying to do any locking here is totally lethal.
- */
- if (unlikely(in_nmi()))
- return scd->clock;
-
if (unlikely(!sched_clock_running))
return 0ull;
- WARN_ON_ONCE(!irqs_disabled());
- now = sched_clock();
-
- if (cpu != raw_smp_processor_id()) {
- struct sched_clock_data *my_scd = this_scd();
-
- lock_double_clock(scd, my_scd);
-
- this_clock = __update_sched_clock(my_scd, now);
- remote_clock = scd->clock;
-
- /*
- * Use the opportunity that we have both locks
- * taken to couple the two clocks: we take the
- * larger time as the latest time for both
- * runqueues. (this creates monotonic movement)
- */
- if (likely((s64)(remote_clock - this_clock) < 0)) {
- clock = this_clock;
- scd->clock = clock;
- } else {
- /*
- * Should be rare, but possible:
- */
- clock = remote_clock;
- my_scd->clock = remote_clock;
- }
-
- __raw_spin_unlock(&my_scd->lock);
- } else {
- __raw_spin_lock(&scd->lock);
- clock = __update_sched_clock(scd, now);
- }
+ scd = cpu_sdc(cpu);
- __raw_spin_unlock(&scd->lock);
+ if (cpu != smp_processor_id())
+ clock = sched_clock_remote(scd);
+ else
+ clock = sched_clock_local(scd);
return clock;
}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
- __raw_spin_lock(&scd->lock);
scd->tick_raw = now;
scd->tick_gtod = now_gtod;
- __update_sched_clock(scd, now);
- __raw_spin_unlock(&scd->lock);
+ sched_clock_local(scd);
}
/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cd73738f0d5f..ecc637a0d591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
+ trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
* 0 <= tv_nsec < NSEC_PER_SEC
* For negative values only the tv_sec field is negative !
*/
-void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
{
while (nsec >= NSEC_PER_SEC) {
+ /*
+ * The following asm() prevents the compiler from
+ * optimising this loop into a modulo operation. See
+ * also __iter_div_u64_rem() in include/linux/time.h
+ */
+ asm("" : "+rm"(nsec));
nsec -= NSEC_PER_SEC;
++sec;
}
while (nsec < 0) {
+ asm("" : "+rm"(nsec));
nsec += NSEC_PER_SEC;
--sec;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
*
* TODO WishList:
* o Allow clocksource drivers to be unregistered
- * o get rid of clocksource_jiffies extern
*/
#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
+#include <linux/kthread.h>
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
}
EXPORT_SYMBOL(timecounter_cyc2time);
-/* XXX - Would like a better way for initializing curr_clocksource */
-extern struct clocksource clocksource_jiffies;
-
/*[Clocksource internal variables]---------
* curr_clocksource:
- * currently selected clocksource. Initialized to clocksource_jiffies.
- * next_clocksource:
- * pending next selected clocksource.
+ * currently selected clocksource.
* clocksource_list:
* linked list with the registered clocksources
- * clocksource_lock:
- * protects manipulations to curr_clocksource and next_clocksource
- * and the clocksource_list
+ * clocksource_mutex:
+ * protects manipulations to curr_clocksource and the clocksource_list
* override_name:
* Name of the user-specified clocksource.
*/
-static struct clocksource *curr_clocksource = &clocksource_jiffies;
-static struct clocksource *next_clocksource;
-static struct clocksource *clocksource_override;
+static struct clocksource *curr_clocksource;
static LIST_HEAD(clocksource_list);
-static DEFINE_SPINLOCK(clocksource_lock);
+static DEFINE_MUTEX(clocksource_mutex);
static char override_name[32];
static int finished_booting;
-/* clocksource_done_booting - Called near the end of core bootup
- *
- * Hack to avoid lots of clocksource churn at boot time.
- * We use fs_initcall because we want this to start before
- * device_initcall but after subsys_initcall.
- */
-static int __init clocksource_done_booting(void)
-{
- finished_booting = 1;
- return 0;
-}
-fs_initcall(clocksource_done_booting);
-
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static void clocksource_watchdog_work(struct work_struct *work);
+
static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static cycle_t watchdog_last;
-static unsigned long watchdog_resumed;
+static int watchdog_running;
+
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
/*
* Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+static void clocksource_watchdog_work(struct work_struct *work)
{
- if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
- return;
+ /*
+ * If kthread_run fails the next watchdog scan over the
+ * watchdog_list will find the unstable clock again.
+ */
+ kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+static void __clocksource_unstable(struct clocksource *cs)
+{
+ cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+ cs->flags |= CLOCK_SOURCE_UNSTABLE;
+ if (finished_booting)
+ schedule_work(&watchdog_work);
+}
+
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
cs->name, delta);
- cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
- clocksource_change_rating(cs, 0);
- list_del(&cs->wd_list);
+ __clocksource_unstable(cs);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs: clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+ if (list_empty(&cs->wd_list))
+ list_add(&cs->wd_list, &watchdog_list);
+ __clocksource_unstable(cs);
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
}
static void clocksource_watchdog(unsigned long data)
{
- struct clocksource *cs, *tmp;
+ struct clocksource *cs;
cycle_t csnow, wdnow;
int64_t wd_nsec, cs_nsec;
- int resumed;
+ int next_cpu;
spin_lock(&watchdog_lock);
-
- resumed = test_and_clear_bit(0, &watchdog_resumed);
+ if (!watchdog_running)
+ goto out;
wdnow = watchdog->read(watchdog);
- wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+ wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
+ watchdog->mult, watchdog->shift);
watchdog_last = wdnow;
- list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
- csnow = cs->read(cs);
+ list_for_each_entry(cs, &watchdog_list, wd_list) {
- if (unlikely(resumed)) {
- cs->wd_last = csnow;
+ /* Clocksource already marked unstable? */
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ if (finished_booting)
+ schedule_work(&watchdog_work);
continue;
}
- /* Initialized ? */
+ csnow = cs->read(cs);
+
+ /* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
- if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
- (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
- cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
- /*
- * We just marked the clocksource as
- * highres-capable, notify the rest of the
- * system as well so that we transition
- * into high-res mode:
- */
- tick_clock_notify();
- }
cs->flags |= CLOCK_SOURCE_WATCHDOG;
cs->wd_last = csnow;
- } else {
- cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
- cs->wd_last = csnow;
- /* Check the delta. Might remove from the list ! */
- clocksource_ratewd(cs, cs_nsec - wd_nsec);
+ continue;
}
- }
- if (!list_empty(&watchdog_list)) {
- /*
- * Cycle through CPUs to check if the CPUs stay
- * synchronized to each other.
- */
- int next_cpu = cpumask_next(raw_smp_processor_id(),
- cpu_online_mask);
+ /* Check the deviation from the watchdog clocksource. */
+ cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+ cs->mask, cs->mult, cs->shift);
+ cs->wd_last = csnow;
+ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ clocksource_unstable(cs, cs_nsec - wd_nsec);
+ continue;
+ }
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+ (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+ (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ /*
+ * We just marked the clocksource as highres-capable,
+ * notify the rest of the system as well so that we
+ * transition into high-res mode:
+ */
+ tick_clock_notify();
+ }
}
+
+ /*
+ * Cycle through CPUs to check if the CPUs stay synchronized
+ * to each other.
+ */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, next_cpu);
+out:
spin_unlock(&watchdog_lock);
}
+
+static inline void clocksource_start_watchdog(void)
+{
+ if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ return;
+ init_timer(&watchdog_timer);
+ watchdog_timer.function = clocksource_watchdog;
+ watchdog_last = watchdog->read(watchdog);
+ watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+ add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+ watchdog_running = 1;
+}
+
+static inline void clocksource_stop_watchdog(void)
+{
+ if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+ return;
+ del_timer(&watchdog_timer);
+ watchdog_running = 0;
+}
+
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
static void clocksource_resume_watchdog(void)
{
- set_bit(0, &watchdog_resumed);
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ clocksource_reset_watchdog();
+ spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_check_watchdog(struct clocksource *cs)
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
- struct clocksource *cse;
unsigned long flags;
spin_lock_irqsave(&watchdog_lock, flags);
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
- int started = !list_empty(&watchdog_list);
-
+ /* cs is a clocksource to be watched. */
list_add(&cs->wd_list, &watchdog_list);
- if (!started && watchdog) {
- watchdog_last = watchdog->read(watchdog);
- watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer,
- cpumask_first(cpu_online_mask));
- }
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
} else {
+ /* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
+ /* Pick the best watchdog. */
if (!watchdog || cs->rating > watchdog->rating) {
- if (watchdog)
- del_timer(&watchdog_timer);
watchdog = cs;
- init_timer(&watchdog_timer);
- watchdog_timer.function = clocksource_watchdog;
-
/* Reset watchdog cycles */
- list_for_each_entry(cse, &watchdog_list, wd_list)
- cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
- /* Start if list is not empty */
- if (!list_empty(&watchdog_list)) {
- watchdog_last = watchdog->read(watchdog);
- watchdog_timer.expires =
- jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer,
- cpumask_first(cpu_online_mask));
- }
+ clocksource_reset_watchdog();
+ }
+ }
+ /* Check if the watchdog timer needs to be started. */
+ clocksource_start_watchdog();
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
+{
+ struct clocksource *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+ /* cs is a watched clocksource. */
+ list_del_init(&cs->wd_list);
+ } else if (cs == watchdog) {
+ /* Reset watchdog cycles */
+ clocksource_reset_watchdog();
+ /* Current watchdog is removed. Find an alternative. */
+ watchdog = NULL;
+ list_for_each_entry(tmp, &clocksource_list, list) {
+ if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
+ continue;
+ if (!watchdog || tmp->rating > watchdog->rating)
+ watchdog = tmp;
}
}
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+ /* Check if the watchdog timer needs to be stopped. */
+ clocksource_stop_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-#else
-static void clocksource_check_watchdog(struct clocksource *cs)
+
+static int clocksource_watchdog_kthread(void *data)
+{
+ struct clocksource *cs, *tmp;
+ unsigned long flags;
+ LIST_HEAD(unstable);
+
+ mutex_lock(&clocksource_mutex);
+ spin_lock_irqsave(&watchdog_lock, flags);
+ list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+ if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+ list_del_init(&cs->wd_list);
+ list_add(&cs->wd_list, &unstable);
+ }
+ /* Check if the watchdog timer needs to be stopped. */
+ clocksource_stop_watchdog();
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+
+ /* Needs to be done outside of watchdog lock */
+ list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
+ list_del_init(&cs->wd_list);
+ __clocksource_change_rating(cs, 0);
+ }
+ mutex_unlock(&clocksource_mutex);
+ return 0;
+}
+
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
-#endif
+static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
/**
* clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
void clocksource_resume(void)
{
struct clocksource *cs;
- unsigned long flags;
- spin_lock_irqsave(&clocksource_lock, flags);
+ mutex_lock(&clocksource_mutex);
- list_for_each_entry(cs, &clocksource_list, list) {
+ list_for_each_entry(cs, &clocksource_list, list)
if (cs->resume)
cs->resume();
- }
clocksource_resume_watchdog();
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ mutex_unlock(&clocksource_mutex);
}
/**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
clocksource_resume_watchdog();
}
+#ifdef CONFIG_GENERIC_TIME
+
/**
- * clocksource_get_next - Returns the selected clocksource
+ * clocksource_select - Select the best clocksource available
+ *
+ * Private function. Must hold clocksource_mutex when called.
*
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
*/
-struct clocksource *clocksource_get_next(void)
+static void clocksource_select(void)
{
- unsigned long flags;
+ struct clocksource *best, *cs;
- spin_lock_irqsave(&clocksource_lock, flags);
- if (next_clocksource && finished_booting) {
- curr_clocksource = next_clocksource;
- next_clocksource = NULL;
+ if (!finished_booting || list_empty(&clocksource_list))
+ return;
+ /* First clocksource on the list has the best rating. */
+ best = list_first_entry(&clocksource_list, struct clocksource, list);
+ /* Check for the override clocksource. */
+ list_for_each_entry(cs, &clocksource_list, list) {
+ if (strcmp(cs->name, override_name) != 0)
+ continue;
+ /*
+ * Check to make sure we don't switch to a non-highres
+ * capable clocksource if the tick code is in oneshot
+ * mode (highres or nohz)
+ */
+ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+ tick_oneshot_mode_active()) {
+ /* Override clocksource cannot be used. */
+ printk(KERN_WARNING "Override clocksource %s is not "
+ "HRT compatible. Cannot switch while in "
+ "HRT/NOHZ mode\n", cs->name);
+ override_name[0] = 0;
+ } else
+ /* Override clocksource can be used. */
+ best = cs;
+ break;
+ }
+ if (curr_clocksource != best) {
+ printk(KERN_INFO "Switching to clocksource %s\n", best->name);
+ curr_clocksource = best;
+ timekeeping_notify(curr_clocksource);
}
- spin_unlock_irqrestore(&clocksource_lock, flags);
-
- return curr_clocksource;
}
-/**
- * select_clocksource - Selects the best registered clocksource.
- *
- * Private function. Must hold clocksource_lock when called.
+#else /* CONFIG_GENERIC_TIME */
+
+static inline void clocksource_select(void) { }
+
+#endif
+
+/*
+ * clocksource_done_booting - Called near the end of core bootup
*
- * Select the clocksource with the best rating, or the clocksource,
- * which is selected by userspace override.
+ * Hack to avoid lots of clocksource churn at boot time.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
*/
-static struct clocksource *select_clocksource(void)
+static int __init clocksource_done_booting(void)
{
- struct clocksource *next;
-
- if (list_empty(&clocksource_list))
- return NULL;
-
- if (clocksource_override)
- next = clocksource_override;
- else
- next = list_entry(clocksource_list.next, struct clocksource,
- list);
+ finished_booting = 1;
- if (next == curr_clocksource)
- return NULL;
+ /*
+ * Run the watchdog first to eliminate unstable clock sources
+ */
+ clocksource_watchdog_kthread(NULL);
- return next;
+ mutex_lock(&clocksource_mutex);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
+ return 0;
}
+fs_initcall(clocksource_done_booting);
/*
* Enqueue the clocksource sorted by rating
*/
-static int clocksource_enqueue(struct clocksource *c)
+static void clocksource_enqueue(struct clocksource *cs)
{
- struct list_head *tmp, *entry = &clocksource_list;
+ struct list_head *entry = &clocksource_list;
+ struct clocksource *tmp;
- list_for_each(tmp, &clocksource_list) {
- struct clocksource *cs;
-
- cs = list_entry(tmp, struct clocksource, list);
- if (cs == c)
- return -EBUSY;
+ list_for_each_entry(tmp, &clocksource_list, list)
/* Keep track of the place, where to insert */
- if (cs->rating >= c->rating)
- entry = tmp;
- }
- list_add(&c->list, entry);
-
- if (strlen(c->name) == strlen(override_name) &&
- !strcmp(c->name, override_name))
- clocksource_override = c;
-
- return 0;
+ if (tmp->rating >= cs->rating)
+ entry = &tmp->list;
+ list_add(&cs->list, entry);
}
/**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
*
* Returns -EBUSY if registration fails, zero otherwise.
*/
-int clocksource_register(struct clocksource *c)
+int clocksource_register(struct clocksource *cs)
{
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&clocksource_lock, flags);
- ret = clocksource_enqueue(c);
- if (!ret)
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
- if (!ret)
- clocksource_check_watchdog(c);
- return ret;
+ mutex_lock(&clocksource_mutex);
+ clocksource_enqueue(cs);
+ clocksource_select();
+ clocksource_enqueue_watchdog(cs);
+ mutex_unlock(&clocksource_mutex);
+ return 0;
}
EXPORT_SYMBOL(clocksource_register);
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+ list_del(&cs->list);
+ cs->rating = rating;
+ clocksource_enqueue(cs);
+ clocksource_select();
+}
+
/**
* clocksource_change_rating - Change the rating of a registered clocksource
- *
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
- unsigned long flags;
-
- spin_lock_irqsave(&clocksource_lock, flags);
- list_del(&cs->list);
- cs->rating = rating;
- clocksource_enqueue(cs);
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ mutex_lock(&clocksource_mutex);
+ __clocksource_change_rating(cs, rating);
+ mutex_unlock(&clocksource_mutex);
}
+EXPORT_SYMBOL(clocksource_change_rating);
/**
* clocksource_unregister - remove a registered clocksource
*/
void clocksource_unregister(struct clocksource *cs)
{
- unsigned long flags;
-
- spin_lock_irqsave(&clocksource_lock, flags);
+ mutex_lock(&clocksource_mutex);
+ clocksource_dequeue_watchdog(cs);
list_del(&cs->list);
- if (clocksource_override == cs)
- clocksource_override = NULL;
- next_clocksource = select_clocksource();
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ clocksource_select();
+ mutex_unlock(&clocksource_mutex);
}
+EXPORT_SYMBOL(clocksource_unregister);
#ifdef CONFIG_SYSFS
/**
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
{
ssize_t count = 0;
- spin_lock_irq(&clocksource_lock);
+ mutex_lock(&clocksource_mutex);
count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
- spin_unlock_irq(&clocksource_lock);
+ mutex_unlock(&clocksource_mutex);
return count;
}
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
- struct clocksource *ovr = NULL;
size_t ret = count;
- int len;
/* strings from sysfs write are not 0 terminated! */
if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
if (buf[count-1] == '\n')
count--;
- spin_lock_irq(&clocksource_lock);
+ mutex_lock(&clocksource_mutex);
if (count > 0)
memcpy(override_name, buf, count);
override_name[count] = 0;
+ clocksource_select();
- len = strlen(override_name);
- if (len) {
- struct clocksource *cs;
-
- ovr = clocksource_override;
- /* try to select it: */
- list_for_each_entry(cs, &clocksource_list, list) {
- if (strlen(cs->name) == len &&
- !strcmp(cs->name, override_name))
- ovr = cs;
- }
- }
-
- /*
- * Check to make sure we don't switch to a non-highres capable
- * clocksource if the tick code is in oneshot mode (highres or nohz)
- */
- if (tick_oneshot_mode_active() && ovr &&
- !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
- printk(KERN_WARNING "%s clocksource is not HRT compatible. "
- "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
- ovr = NULL;
- override_name[0] = 0;
- }
-
- /* Reselect, when the override name has changed */
- if (ovr != clocksource_override) {
- clocksource_override = ovr;
- next_clocksource = select_clocksource();
- }
-
- spin_unlock_irq(&clocksource_lock);
+ mutex_unlock(&clocksource_mutex);
return ret;
}
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
struct clocksource *src;
ssize_t count = 0;
- spin_lock_irq(&clocksource_lock);
+ mutex_lock(&clocksource_mutex);
list_for_each_entry(src, &clocksource_list, list) {
/*
* Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
"%s ", src->name);
}
- spin_unlock_irq(&clocksource_lock);
+ mutex_unlock(&clocksource_mutex);
count += snprintf(buf + count,
max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
*/
static int __init boot_override_clocksource(char* str)
{
- unsigned long flags;
- spin_lock_irqsave(&clocksource_lock, flags);
+ mutex_lock(&clocksource_mutex);
if (str)
strlcpy(override_name, str, sizeof(override_name));
- spin_unlock_irqrestore(&clocksource_lock, flags);
+ mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
- .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
.shift = JIFFIES_SHIFT,
};
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
}
core_initcall(init_jiffies_clocksource);
+
+struct clocksource * __init __weak clocksource_default_clock(void)
+{
+ return &clocksource_jiffies;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
case TIME_OK:
break;
case TIME_INS:
- xtime.tv_sec--;
- wall_to_monotonic.tv_sec++;
+ timekeeping_leap_insert(-1);
time_state = TIME_OOP;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
res = HRTIMER_RESTART;
break;
case TIME_DEL:
- xtime.tv_sec++;
+ timekeeping_leap_insert(1);
time_tai--;
- wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_state = TIME_OK;
break;
}
- update_vsyscall(&xtime, clock);
write_sequnlock(&xtime_lock);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/tick.h>
+#include <linux/stop_machine.h>
+
+/* Structure holding internal timekeeping values. */
+struct timekeeper {
+ /* Current clocksource used for timekeeping. */
+ struct clocksource *clock;
+ /* The shift value of the current clocksource. */
+ int shift;
+
+ /* Number of clock cycles in one NTP interval. */
+ cycle_t cycle_interval;
+ /* Number of clock shifted nano seconds in one NTP interval. */
+ u64 xtime_interval;
+ /* Raw nano seconds accumulated per NTP interval. */
+ u32 raw_interval;
+
+ /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
+ u64 xtime_nsec;
+ /* Difference between accumulated time and NTP time in ntp
+ * shifted nano seconds. */
+ s64 ntp_error;
+ /* Shift conversion between clock shifted nano seconds and
+ * ntp shifted nano seconds. */
+ int ntp_error_shift;
+ /* NTP adjusted clock multiplier */
+ u32 mult;
+};
+
+struct timekeeper timekeeper;
+
+/**
+ * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @clock: Pointer to clocksource.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void timekeeper_setup_internals(struct clocksource *clock)
+{
+ cycle_t interval;
+ u64 tmp;
+
+ timekeeper.clock = clock;
+ clock->cycle_last = clock->read(clock);
+ /* Do the ns -> cycle conversion first, using original mult */
+ tmp = NTP_INTERVAL_LENGTH;
+ tmp <<= clock->shift;
+ tmp += clock->mult/2;
+ do_div(tmp, clock->mult);
+ if (tmp == 0)
+ tmp = 1;
+
+ interval = (cycle_t) tmp;
+ timekeeper.cycle_interval = interval;
+
+ /* Go back from cycles -> shifted ns */
+ timekeeper.xtime_interval = (u64) interval * clock->mult;
+ timekeeper.raw_interval =
+ ((u64) interval * clock->mult) >> clock->shift;
+
+ timekeeper.xtime_nsec = 0;
+ timekeeper.shift = clock->shift;
+
+ timekeeper.ntp_error = 0;
+ timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+
+ /*
+ * The timekeeper keeps its own mult values for the currently
+ * active clocksource. These value will be adjusted via NTP
+ * to counteract clock drifting.
+ */
+ timekeeper.mult = clock->mult;
+}
+
+/* Timekeeper helper functions. */
+static inline s64 timekeeping_get_ns(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
+
+ /* read clocksource: */
+ clock = timekeeper.clock;
+ cycle_now = clock->read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* return delta convert to nanoseconds using ntp adjusted mult. */
+ return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+ timekeeper.shift);
+}
+
+static inline s64 timekeeping_get_ns_raw(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
+
+ /* read clocksource: */
+ clock = timekeeper.clock;
+ cycle_now = clock->read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* return delta convert to nanoseconds using ntp adjusted mult. */
+ return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+}
/*
* This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
*/
struct timespec xtime __attribute__ ((aligned (16)));
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-static unsigned long total_sleep_time; /* seconds */
+static struct timespec total_sleep_time;
+
+/*
+ * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
+ */
+struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
timespec_add_ns(&xtime_cache, nsec);
}
-struct clocksource *clock;
-
+/* must hold xtime_lock */
+void timekeeping_leap_insert(int leapsecond)
+{
+ xtime.tv_sec += leapsecond;
+ wall_to_monotonic.tv_sec -= leapsecond;
+ update_vsyscall(&xtime, timekeeper.clock);
+}
#ifdef CONFIG_GENERIC_TIME
+
/**
- * clocksource_forward_now - update clock to the current time
+ * timekeeping_forward_now - update clock to the current time
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void clocksource_forward_now(void)
+static void timekeeping_forward_now(void)
{
cycle_t cycle_now, cycle_delta;
+ struct clocksource *clock;
s64 nsec;
- cycle_now = clocksource_read(clock);
+ clock = timekeeper.clock;
+ cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
clock->cycle_last = cycle_now;
- nsec = cyc2ns(clock, cycle_delta);
+ nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+ timekeeper.shift);
/* If arch requires, add in gettimeoffset() */
nsec += arch_gettimeoffset();
timespec_add_ns(&xtime, nsec);
- nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
- clock->raw_time.tv_nsec += nsec;
+ nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ timespec_add_ns(&raw_time, nsec);
}
/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
*/
void getnstimeofday(struct timespec *ts)
{
- cycle_t cycle_now, cycle_delta;
unsigned long seq;
s64 nsecs;
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
seq = read_seqbegin(&xtime_lock);
*ts = xtime;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- nsecs = cyc2ns(clock, cycle_delta);
+ nsecs = timekeeping_get_ns();
/* If arch requires, add in gettimeoffset() */
nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
EXPORT_SYMBOL(getnstimeofday);
+ktime_t ktime_get(void)
+{
+ unsigned int seq;
+ s64 secs, nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
+ nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
+ nsecs += timekeeping_get_ns();
+
+ } while (read_seqretry(&xtime_lock, seq));
+ /*
+ * Use ktime_set/ktime_add_ns to create a proper ktime on
+ * 32-bit architectures without CONFIG_KTIME_SCALAR.
+ */
+ return ktime_add_ns(ktime_set(secs, 0), nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+ struct timespec tomono;
+ unsigned int seq;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *ts = xtime;
+ tomono = wall_to_monotonic;
+ nsecs = timekeeping_get_ns();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
write_seqlock_irqsave(&xtime_lock, flags);
- clocksource_forward_now();
+ timekeeping_forward_now();
ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
update_xtime_cache(0);
- clock->error = 0;
+ timekeeper.ntp_error = 0;
ntp_clear();
- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, timekeeper.clock);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
*
* Accumulates current time interval and initializes new clocksource
*/
-static void change_clocksource(void)
+static int change_clocksource(void *data)
{
struct clocksource *new, *old;
- new = clocksource_get_next();
+ new = (struct clocksource *) data;
+
+ timekeeping_forward_now();
+ if (!new->enable || new->enable(new) == 0) {
+ old = timekeeper.clock;
+ timekeeper_setup_internals(new);
+ if (old->disable)
+ old->disable(old);
+ }
+ return 0;
+}
- if (clock == new)
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock: pointer to the clock source
+ *
+ * This function is called from clocksource.c after a new, better clock
+ * source has been registered. The caller holds the clocksource_mutex.
+ */
+void timekeeping_notify(struct clocksource *clock)
+{
+ if (timekeeper.clock == clock)
return;
+ stop_machine(change_clocksource, clock, NULL);
+ tick_clock_notify();
+}
- clocksource_forward_now();
+#else /* GENERIC_TIME */
- if (clocksource_enable(new))
- return;
+static inline void timekeeping_forward_now(void) { }
- new->raw_time = clock->raw_time;
- old = clock;
- clock = new;
- clocksource_disable(old);
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+ struct timespec now;
- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ ktime_get_ts(&now);
- tick_clock_notify();
+ return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
- /*
- * We're holding xtime lock and waking up klogd would deadlock
- * us on enqueue. So no printing!
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
- */
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts: pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+ struct timespec tomono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ getnstimeofday(ts);
+ tomono = wall_to_monotonic;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec);
}
-#else
-static inline void clocksource_forward_now(void) { }
-static inline void change_clocksource(void) { }
-#endif
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+#endif /* !GENERIC_TIME */
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+ struct timespec now;
+
+ getnstimeofday(&now);
+
+ return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real);
/**
* getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
{
unsigned long seq;
s64 nsecs;
- cycle_t cycle_now, cycle_delta;
do {
seq = read_seqbegin(&xtime_lock);
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
-
- *ts = clock->raw_time;
+ nsecs = timekeeping_get_ns_raw();
+ *ts = raw_time;
} while (read_seqretry(&xtime_lock, seq));
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqbegin(&xtime_lock);
- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqretry(&xtime_lock, seq));
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
}
/**
- * read_persistent_clock - Return time in seconds from the persistent clock.
+ * read_persistent_clock - Return time from the persistent clock.
*
* Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
- * Returns zero if unsupported.
+ * Reads the time from the battery backed persistent clock.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
*
* XXX - Do be sure to remove it once all arches implement it.
*/
-unsigned long __attribute__((weak)) read_persistent_clock(void)
+void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
{
- return 0;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+}
+
+/**
+ * read_boot_clock - Return time of the system start.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Function to read the exact time the system has been started.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+{
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
}
/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
*/
void __init timekeeping_init(void)
{
+ struct clocksource *clock;
unsigned long flags;
- unsigned long sec = read_persistent_clock();
+ struct timespec now, boot;
+
+ read_persistent_clock(&now);
+ read_boot_clock(&boot);
write_seqlock_irqsave(&xtime_lock, flags);
ntp_init();
- clock = clocksource_get_next();
- clocksource_enable(clock);
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
-
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
+ clock = clocksource_default_clock();
+ if (clock->enable)
+ clock->enable(clock);
+ timekeeper_setup_internals(clock);
+
+ xtime.tv_sec = now.tv_sec;
+ xtime.tv_nsec = now.tv_nsec;
+ raw_time.tv_sec = 0;
+ raw_time.tv_nsec = 0;
+ if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
+ boot.tv_sec = xtime.tv_sec;
+ boot.tv_nsec = xtime.tv_nsec;
+ }
set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ -boot.tv_sec, -boot.tv_nsec);
update_xtime_cache(0);
- total_sleep_time = 0;
+ total_sleep_time.tv_sec = 0;
+ total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
}
/* time in seconds when suspend began */
-static unsigned long timekeeping_suspend_time;
+static struct timespec timekeeping_suspend_time;
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
static int timekeeping_resume(struct sys_device *dev)
{
unsigned long flags;
- unsigned long now = read_persistent_clock();
+ struct timespec ts;
+
+ read_persistent_clock(&ts);
clocksource_resume();
write_seqlock_irqsave(&xtime_lock, flags);
- if (now && (now > timekeeping_suspend_time)) {
- unsigned long sleep_length = now - timekeeping_suspend_time;
-
- xtime.tv_sec += sleep_length;
- wall_to_monotonic.tv_sec -= sleep_length;
- total_sleep_time += sleep_length;
+ if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
+ ts = timespec_sub(ts, timekeeping_suspend_time);
+ xtime = timespec_add_safe(xtime, ts);
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
+ total_sleep_time = timespec_add_safe(total_sleep_time, ts);
}
update_xtime_cache(0);
/* re-base the last cycle value */
- clock->cycle_last = 0;
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
+ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
+ timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
{
unsigned long flags;
- timekeeping_suspend_time = read_persistent_clock();
+ read_persistent_clock(&timekeeping_suspend_time);
write_seqlock_irqsave(&xtime_lock, flags);
- clocksource_forward_now();
+ timekeeping_forward_now();
timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
* If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
s64 *offset)
{
s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
error2 = abs(error2);
for (look_ahead = 0; error2 > 0; look_ahead++)
error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* Now calculate the error in (1 << look_ahead) ticks, but first
* remove the single look ahead already included in the error.
*/
- tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
+ tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
+ tick_error -= timekeeper.xtime_interval >> 1;
error = ((error - tick_error) >> look_ahead) + tick_error;
/* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* this is optimized for the most common adjustments of -1,0,1,
* for other values we can do a bit more work.
*/
-static void clocksource_adjust(s64 offset)
+static void timekeeping_adjust(s64 offset)
{
- s64 error, interval = clock->cycle_interval;
+ s64 error, interval = timekeeper.cycle_interval;
int adj;
- error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
if (error > interval) {
error >>= 2;
if (likely(error <= interval))
adj = 1;
else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timekeeping_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
error >>= 2;
if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
interval = -interval;
offset = -offset;
} else
- adj = clocksource_bigadjust(error, &interval, &offset);
+ adj = timekeeping_bigadjust(error, &interval, &offset);
} else
return;
- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (NTP_SCALE_SHIFT - clock->shift);
+ timekeeper.mult += adj;
+ timekeeper.xtime_interval += interval;
+ timekeeper.xtime_nsec -= offset;
+ timekeeper.ntp_error -= (interval - offset) <<
+ timekeeper.ntp_error_shift;
}
/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
*/
void update_wall_time(void)
{
+ struct clocksource *clock;
cycle_t offset;
+ u64 nsecs;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return;
+ clock = timekeeper.clock;
#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+ offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#else
- offset = clock->cycle_interval;
+ offset = timekeeper.cycle_interval;
#endif
- clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+ timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
*/
- while (offset >= clock->cycle_interval) {
+ while (offset >= timekeeper.cycle_interval) {
+ u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
/* accumulate one interval */
- offset -= clock->cycle_interval;
- clock->cycle_last += clock->cycle_interval;
+ offset -= timekeeper.cycle_interval;
+ clock->cycle_last += timekeeper.cycle_interval;
- clock->xtime_nsec += clock->xtime_interval;
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ timekeeper.xtime_nsec += timekeeper.xtime_interval;
+ if (timekeeper.xtime_nsec >= nsecps) {
+ timekeeper.xtime_nsec -= nsecps;
xtime.tv_sec++;
second_overflow();
}
- clock->raw_time.tv_nsec += clock->raw_interval;
- if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
- clock->raw_time.tv_nsec -= NSEC_PER_SEC;
- clock->raw_time.tv_sec++;
+ raw_time.tv_nsec += timekeeper.raw_interval;
+ if (raw_time.tv_nsec >= NSEC_PER_SEC) {
+ raw_time.tv_nsec -= NSEC_PER_SEC;
+ raw_time.tv_sec++;
}
/* accumulate error between NTP and clock interval */
- clock->error += tick_length;
- clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+ timekeeper.ntp_error += tick_length;
+ timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ timekeeper.ntp_error_shift;
}
/* correct the clock when NTP error is too big */
- clocksource_adjust(offset);
+ timekeeping_adjust(offset);
/*
* Since in the loop above, we accumulate any amount of time
* in xtime_nsec over a second into xtime.tv_sec, its possible for
* xtime_nsec to be fairly small after the loop. Further, if we're
- * slightly speeding the clocksource up in clocksource_adjust(),
+ * slightly speeding the clocksource up in timekeeping_adjust(),
* its possible the required corrective factor to xtime_nsec could
* cause it to underflow.
*
@@ -550,24 +792,25 @@ void update_wall_time(void)
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)clock->xtime_nsec < 0)) {
- s64 neg = -(s64)clock->xtime_nsec;
- clock->xtime_nsec = 0;
- clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+ if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
+ s64 neg = -(s64)timekeeper.xtime_nsec;
+ timekeeper.xtime_nsec = 0;
+ timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
}
/* store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
*/
- xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
- clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+ xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
+ timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
+ timekeeper.ntp_error += timekeeper.xtime_nsec <<
+ timekeeper.ntp_error_shift;
- update_xtime_cache(cyc2ns(clock, offset));
+ nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
+ update_xtime_cache(nsecs);
/* check to see if there is a new clocksource to use */
- change_clocksource();
- update_vsyscall(&xtime, clock);
+ update_vsyscall(&xtime, timekeeper.clock);
}
/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
*/
void getboottime(struct timespec *ts)
{
- set_normalized_timespec(ts,
- - (wall_to_monotonic.tv_sec + total_sleep_time),
- - wall_to_monotonic.tv_nsec);
+ struct timespec boottime = {
+ .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
+ .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
+ };
+
+ set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
}
/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
*/
void monotonic_to_bootbased(struct timespec *ts)
{
- ts->tv_sec += total_sleep_time;
+ *ts = timespec_add_safe(*ts, total_sleep_time);
}
unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
}
EXPORT_SYMBOL(get_seconds);
+struct timespec __current_kernel_time(void)
+{
+ return xtime_cache;
+}
struct timespec current_kernel_time(void)
{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
return now;
}
EXPORT_SYMBOL(current_kernel_time);
+
+struct timespec get_monotonic_coarse(void)
+{
+ struct timespec now, mono;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ now = xtime_cache;
+ mono = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+ now.tv_nsec + mono.tv_nsec);
+ return now;
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..bbb51074680e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -72,6 +72,7 @@ struct tvec_base {
spinlock_t lock;
struct timer_list *running_timer;
unsigned long timer_jiffies;
+ unsigned long next_timer;
struct tvec_root tv1;
struct tvec tv2;
struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
if (timer_pending(timer)) {
detach_timer(timer, 0);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
} else {
if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
}
timer->expires = expires;
+ if (time_before(timer->expires, base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = timer->expires;
internal_add_timer(base, timer);
out_unlock:
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
debug_timer_activate(timer);
+ if (time_before(timer->expires, base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = timer->expires;
internal_add_timer(base, timer);
/*
* Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
base = lock_timer_base(timer, &flags);
if (timer_pending(timer)) {
detach_timer(timer, 1);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
}
spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
ret = 0;
if (timer_pending(timer)) {
detach_timer(timer, 1);
+ if (timer->expires == base->next_timer &&
+ !tbase_get_deferrable(timer->base))
+ base->next_timer = base->timer_jiffies;
ret = 1;
}
out:
@@ -1007,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base)
#ifdef CONFIG_NO_HZ
/*
* Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a cpus is idle.
- * This functions needs to be called disabled.
+ * is used on S/390 to stop all activity when a CPU is idle.
+ * This function needs to be called with interrupts disabled.
*/
static unsigned long __next_timer_interrupt(struct tvec_base *base)
{
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
unsigned long expires;
spin_lock(&base->lock);
- expires = __next_timer_interrupt(base);
+ if (time_before_eq(base->next_timer, base->timer_jiffies))
+ base->next_timer = __next_timer_interrupt(base);
+ expires = base->next_timer;
spin_unlock(&base->lock);
if (time_before_eq(expires, now))
@@ -1522,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
INIT_LIST_HEAD(base->tv1.vec + j);
base->timer_jiffies = jiffies;
+ base->next_timer = base->timer_jiffies;
return 0;
}
@@ -1534,6 +1553,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
timer_set_base(timer, new_base);
+ if (time_before(timer->expires, new_base->next_timer) &&
+ !tbase_get_deferrable(timer->base))
+ new_base->next_timer = timer->expires;
internal_add_timer(new_base, timer);
}
}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += power-traces.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc615f84751b..c71e91bf7372 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2414,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
- unsigned long *array = m->private;
-
if (*pos >= ftrace_graph_count)
return NULL;
- return &array[*pos];
+ return &ftrace_graph_funcs[*pos];
}
static void *
@@ -2482,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
ftrace_graph_count = 0;
memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
}
+ mutex_unlock(&graph_lock);
- if (file->f_mode & FMODE_READ) {
+ if (file->f_mode & FMODE_READ)
ret = seq_open(file, &ftrace_graph_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = ftrace_graph_funcs;
- }
- } else
- file->private_data = ftrace_graph_funcs;
- mutex_unlock(&graph_lock);
return ret;
}
@@ -2560,7 +2552,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
- unsigned long *array;
size_t read = 0;
ssize_t ret;
@@ -2574,12 +2565,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
goto out;
}
- if (file->f_mode & FMODE_READ) {
- struct seq_file *m = file->private_data;
- array = m->private;
- } else
- array = file->private_data;
-
if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
ret = -ENOMEM;
goto out;
@@ -2591,7 +2576,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
parser.buffer[parser.idx] = 0;
/* we allow only one expression at a time */
- ret = ftrace_set_func(array, &ftrace_graph_count,
+ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
parser.buffer);
if (ret)
goto out;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6eef38923b07..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
}
EXPORT_SYMBOL_GPL(tracing_is_on);
-#include "trace.h"
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 420232a1fbba..a35925d222ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
static int tracing_set_tracer(const char *buf);
-#define BOOTUP_TRACER_SIZE 100
-static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+#define MAX_TRACER_SIZE 100
+static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static int __init set_ftrace(char *str)
{
- strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+ strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
ring_buffer_expanded = 1;
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
static struct tracer *current_trace __read_mostly;
/*
- * max_tracer_type_len is used to simplify the allocating of
- * buffers to read userspace tracer names. We keep track of
- * the longest tracer name registered.
- */
-static int max_tracer_type_len;
-
-/*
* trace_types_lock is used to protect the trace_types list.
* This lock is also used to keep user access serialized.
* Accesses from userspace will grab this lock while userspace
@@ -625,7 +618,6 @@ __releases(kernel_lock)
__acquires(kernel_lock)
{
struct tracer *t;
- int len;
int ret = 0;
if (!type->name) {
@@ -633,6 +625,11 @@ __acquires(kernel_lock)
return -1;
}
+ if (strlen(type->name) > MAX_TRACER_SIZE) {
+ pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
+ return -1;
+ }
+
/*
* When this gets called we hold the BKL which means that
* preemption is disabled. Various trace selftests however
@@ -647,7 +644,7 @@ __acquires(kernel_lock)
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
- pr_info("Trace %s already registered\n",
+ pr_info("Tracer %s already registered\n",
type->name);
ret = -1;
goto out;
@@ -698,9 +695,6 @@ __acquires(kernel_lock)
type->next = trace_types;
trace_types = type;
- len = strlen(type->name);
- if (len > max_tracer_type_len)
- max_tracer_type_len = len;
out:
tracing_selftest_running = false;
@@ -709,7 +703,7 @@ __acquires(kernel_lock)
if (ret || !default_bootup_tracer)
goto out_unlock;
- if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+ if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
goto out_unlock;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -731,14 +725,13 @@ __acquires(kernel_lock)
void unregister_tracer(struct tracer *type)
{
struct tracer **t;
- int len;
mutex_lock(&trace_types_lock);
for (t = &trace_types; *t; t = &(*t)->next) {
if (*t == type)
goto found;
}
- pr_info("Trace %s not registered\n", type->name);
+ pr_info("Tracer %s not registered\n", type->name);
goto out;
found:
@@ -751,17 +744,7 @@ void unregister_tracer(struct tracer *type)
current_trace->stop(&global_trace);
current_trace = &nop_trace;
}
-
- if (strlen(type->name) != max_tracer_type_len)
- goto out;
-
- max_tracer_type_len = 0;
- for (t = &trace_types; *t; t = &(*t)->next) {
- len = strlen((*t)->name);
- if (len > max_tracer_type_len)
- max_tracer_type_len = len;
- }
- out:
+out:
mutex_unlock(&trace_types_lock);
}
@@ -2610,7 +2593,7 @@ static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[max_tracer_type_len+2];
+ char buf[MAX_TRACER_SIZE+2];
int r;
mutex_lock(&trace_types_lock);
@@ -2760,15 +2743,15 @@ static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- char buf[max_tracer_type_len+1];
+ char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
int err;
ret = cnt;
- if (cnt > max_tracer_type_len)
- cnt = max_tracer_type_len;
+ if (cnt > MAX_TRACER_SIZE)
+ cnt = MAX_TRACER_SIZE;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86bcff94791a..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <linux/kmemtrace.h>
-#include <trace/power.h>
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
TRACE_HW_BRANCHES,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
- TRACE_POWER,
TRACE_BLK,
__TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a431748ddd6e..ead3d724599d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
F_printk("from: %llx to: %llx", __entry->from, __entry->to)
);
-FTRACE_ENTRY(power, trace_power,
-
- TRACE_POWER,
-
- F_STRUCT(
- __field_struct( struct power_trace, state_data )
- __field_desc( s64, state_data, stamp )
- __field_desc( s64, state_data, end )
- __field_desc( int, state_data, type )
- __field_desc( int, state_data, state )
- ),
-
- F_printk("%llx->%llx type:%u state:%u",
- __entry->stamp, __entry->end,
- __entry->type, __entry->state)
-);
-
FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 55a25c933d15..dd44b8768867 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,6 +8,57 @@
#include <linux/module.h>
#include "trace.h"
+/*
+ * We can't use a size but a type in alloc_percpu()
+ * So let's create a dummy type that matches the desired size
+ */
+typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
+
+char *trace_profile_buf;
+EXPORT_SYMBOL_GPL(trace_profile_buf);
+
+char *trace_profile_buf_nmi;
+EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+
+/* Count the events in use (per event id, not per instance) */
+static int total_profile_count;
+
+static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+{
+ char *buf;
+ int ret = -ENOMEM;
+
+ if (atomic_inc_return(&event->profile_count))
+ return 0;
+
+ if (!total_profile_count++) {
+ buf = (char *)alloc_percpu(profile_buf_t);
+ if (!buf)
+ goto fail_buf;
+
+ rcu_assign_pointer(trace_profile_buf, buf);
+
+ buf = (char *)alloc_percpu(profile_buf_t);
+ if (!buf)
+ goto fail_buf_nmi;
+
+ rcu_assign_pointer(trace_profile_buf_nmi, buf);
+ }
+
+ ret = event->profile_enable();
+ if (!ret)
+ return 0;
+
+ kfree(trace_profile_buf_nmi);
+fail_buf_nmi:
+ kfree(trace_profile_buf);
+fail_buf:
+ total_profile_count--;
+ atomic_dec(&event->profile_count);
+
+ return ret;
+}
+
int ftrace_profile_enable(int event_id)
{
struct ftrace_event_call *event;
@@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id)
list_for_each_entry(event, &ftrace_events, list) {
if (event->id == event_id && event->profile_enable &&
try_module_get(event->mod)) {
- ret = event->profile_enable(event);
+ ret = ftrace_profile_enable_event(event);
break;
}
}
@@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id)
return ret;
}
+static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+{
+ char *buf, *nmi_buf;
+
+ if (!atomic_add_negative(-1, &event->profile_count))
+ return;
+
+ event->profile_disable();
+
+ if (!--total_profile_count) {
+ buf = trace_profile_buf;
+ rcu_assign_pointer(trace_profile_buf, NULL);
+
+ nmi_buf = trace_profile_buf_nmi;
+ rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+ /*
+ * Ensure every events in profiling have finished before
+ * releasing the buffers
+ */
+ synchronize_sched();
+
+ free_percpu(buf);
+ free_percpu(nmi_buf);
+ }
+}
+
void ftrace_profile_disable(int event_id)
{
struct ftrace_event_call *event;
@@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id)
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
if (event->id == event_id) {
- event->profile_disable(event);
+ ftrace_profile_disable_event(event);
module_put(event->mod);
break;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 56c260b83a9c..6f03c8a1105e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -271,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct list_head *list = m->private;
- struct ftrace_event_call *call;
+ struct ftrace_event_call *call = v;
(*pos)++;
- for (;;) {
- if (list == &ftrace_events)
- return NULL;
-
- call = list_entry(list, struct ftrace_event_call, list);
-
+ list_for_each_entry_continue(call, &ftrace_events, list) {
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->regfunc)
- break;
-
- list = list->next;
+ return call;
}
- m->private = list->next;
-
- return call;
+ return NULL;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call = NULL;
+ struct ftrace_event_call *call;
loff_t l;
mutex_lock(&event_mutex);
- m->private = ftrace_events.next;
+ call = list_entry(&ftrace_events, struct ftrace_event_call, list);
for (l = 0; l <= *pos; ) {
- call = t_next(m, NULL, &l);
+ call = t_next(m, call, &l);
if (!call)
break;
}
@@ -316,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct list_head *list = m->private;
- struct ftrace_event_call *call;
+ struct ftrace_event_call *call = v;
(*pos)++;
- retry:
- if (list == &ftrace_events)
- return NULL;
-
- call = list_entry(list, struct ftrace_event_call, list);
-
- if (!call->enabled) {
- list = list->next;
- goto retry;
+ list_for_each_entry_continue(call, &ftrace_events, list) {
+ if (call->enabled)
+ return call;
}
- m->private = list->next;
-
- return call;
+ return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call = NULL;
+ struct ftrace_event_call *call;
loff_t l;
mutex_lock(&event_mutex);
- m->private = ftrace_events.next;
+ call = list_entry(&ftrace_events, struct ftrace_event_call, list);
for (l = 0; l <= *pos; ) {
- call = s_next(m, NULL, &l);
+ call = s_next(m, call, &l);
if (!call)
break;
}
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * ring buffer based C-state tracer
- *
- * Arjan van de Ven <arjan@linux.intel.com>
- * Copyright (C) 2008 Intel Corporation
- *
- * Much is borrowed from trace_boot.c which is
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/power.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *power_trace;
-static int __read_mostly trace_power_enabled;
-
-static void probe_power_start(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- if (!trace_power_enabled)
- return;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
-}
-
-
-static void probe_power_end(struct power_trace *it)
-{
- struct ftrace_event_call *call = &event_power;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- buffer = tr->buffer;
-
- preempt_disable();
- it->end = ktime_get();
- data = tr->data[smp_processor_id()];
-
- event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->state_data = *it;
- if (!filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
- preempt_enable();
-}
-
-static void probe_power_mark(struct power_trace *it, unsigned int type,
- unsigned int level)
-{
- struct ftrace_event_call *call = &event_power;
- struct ring_buffer_event *event;
- struct ring_buffer *buffer;
- struct trace_power *entry;
- struct trace_array_cpu *data;
- struct trace_array *tr = power_trace;
-
- if (!trace_power_enabled)
- return;
-
- buffer = tr->buffer;
-
- memset(it, 0, sizeof(struct power_trace));
- it->state = level;
- it->type = type;
- it->stamp = ktime_get();
- preempt_disable();
- it->end = it->stamp;
- data = tr->data[smp_processor_id()];
-
- event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
- sizeof(*entry), 0, 0);
- if (!event)
- goto out;
- entry = ring_buffer_event_data(event);
- entry->state_data = *it;
- if (!filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
- preempt_enable();
-}
-
-static int tracing_power_register(void)
-{
- int ret;
-
- ret = register_trace_power_start(probe_power_start);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_start\n");
- return ret;
- }
- ret = register_trace_power_end(probe_power_end);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_end\n");
- goto fail_start;
- }
- ret = register_trace_power_mark(probe_power_mark);
- if (ret) {
- pr_info("power trace: Couldn't activate tracepoint"
- " probe to trace_power_mark\n");
- goto fail_end;
- }
- return ret;
-fail_end:
- unregister_trace_power_end(probe_power_end);
-fail_start:
- unregister_trace_power_start(probe_power_start);
- return ret;
-}
-
-static void start_power_trace(struct trace_array *tr)
-{
- trace_power_enabled = 1;
-}
-
-static void stop_power_trace(struct trace_array *tr)
-{
- trace_power_enabled = 0;
-}
-
-static void power_trace_reset(struct trace_array *tr)
-{
- trace_power_enabled = 0;
- unregister_trace_power_start(probe_power_start);
- unregister_trace_power_end(probe_power_end);
- unregister_trace_power_mark(probe_power_mark);
-}
-
-
-static int power_trace_init(struct trace_array *tr)
-{
- power_trace = tr;
-
- trace_power_enabled = 1;
- tracing_power_register();
-
- tracing_reset_online_cpus(tr);
- return 0;
-}
-
-static enum print_line_t power_print_line(struct trace_iterator *iter)
-{
- int ret = 0;
- struct trace_entry *entry = iter->ent;
- struct trace_power *field ;
- struct power_trace *it;
- struct trace_seq *s = &iter->seq;
- struct timespec stamp;
- struct timespec duration;
-
- trace_assign_type(field, entry);
- it = &field->state_data;
- stamp = ktime_to_timespec(it->stamp);
- duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
-
- if (entry->type == TRACE_POWER) {
- if (it->type == POWER_CSTATE)
- ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
- stamp.tv_sec,
- stamp.tv_nsec,
- it->state, iter->cpu,
- duration.tv_sec,
- duration.tv_nsec);
- if (it->type == POWER_PSTATE)
- ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
- stamp.tv_sec,
- stamp.tv_nsec,
- it->state, iter->cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
- }
- return TRACE_TYPE_UNHANDLED;
-}
-
-static void power_print_header(struct seq_file *s)
-{
- seq_puts(s, "# TIMESTAMP STATE EVENT\n");
- seq_puts(s, "# | | |\n");
-}
-
-static struct tracer power_tracer __read_mostly =
-{
- .name = "power",
- .init = power_trace_init,
- .start = start_power_trace,
- .stop = stop_power_trace,
- .reset = power_trace_reset,
- .print_line = power_print_line,
- .print_header = power_print_header,
-};
-
-static int init_power_trace(void)
-{
- return register_tracer(&power_tracer);
-}
-device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
-#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
static void prof_syscall_enter(struct pt_regs *regs, long id)
{
- struct syscall_trace_enter *rec;
struct syscall_metadata *sys_data;
+ struct syscall_trace_enter *rec;
+ unsigned long flags;
+ char *raw_data;
int syscall_nr;
int size;
+ int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- do {
- char raw_data[size];
+ if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ "profile buffer not large enough"))
+ return;
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(flags);
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ raw_data = rcu_dereference(trace_profile_buf_nmi);
+ else
+ raw_data = rcu_dereference(trace_profile_buf);
+
+ if (!raw_data)
+ goto end;
- rec = (struct syscall_trace_enter *) raw_data;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_id;
- rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args,
- (unsigned long *)&rec->args);
- perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
- } while(0);
+ raw_data = per_cpu_ptr(raw_data, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ rec = (struct syscall_trace_enter *) raw_data;
+ tracing_generic_entry_update(&rec->ent, 0, 0);
+ rec->ent.type = sys_data->enter_id;
+ rec->nr = syscall_nr;
+ syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+ (unsigned long *)&rec->args);
+ perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+end:
+ local_irq_restore(flags);
}
int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
static void prof_syscall_exit(struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
- struct syscall_trace_exit rec;
+ struct syscall_trace_exit *rec;
+ unsigned long flags;
int syscall_nr;
+ char *raw_data;
+ int size;
+ int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
if (!sys_data)
return;
- tracing_generic_entry_update(&rec.ent, 0, 0);
- rec.ent.type = sys_data->exit_id;
- rec.nr = syscall_nr;
- rec.ret = syscall_get_return_value(current, regs);
+ /* We can probably do that at build time */
+ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+ size -= sizeof(u32);
- perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+ /*
+ * Impossible, but be paranoid with the future
+ * How to put this check outside runtime?
+ */
+ if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ "exit event has grown above profile buffer size"))
+ return;
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ raw_data = rcu_dereference(trace_profile_buf_nmi);
+ else
+ raw_data = rcu_dereference(trace_profile_buf);
+
+ if (!raw_data)
+ goto end;
+
+ raw_data = per_cpu_ptr(raw_data, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ rec = (struct syscall_trace_exit *)raw_data;
+
+ tracing_generic_entry_update(&rec->ent, 0, 0);
+ rec->ent.type = sys_data->exit_id;
+ rec->nr = syscall_nr;
+ rec->ret = syscall_get_return_value(current, regs);
+
+ perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+ local_irq_restore(flags);
}
int reg_prof_syscall_exit(char *name)