Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c  10
-rw-r--r--  kernel/audit.c  2
-rw-r--r--  kernel/cgroup.c  693
-rw-r--r--  kernel/exit.c  2
-rw-r--r--  kernel/fork.c  47
-rw-r--r--  kernel/hw_breakpoint.c  11
-rw-r--r--  kernel/irq/chip.c  2
-rw-r--r--  kernel/irq/devres.c  4
-rw-r--r--  kernel/kprobes.c  3
-rw-r--r--  kernel/ksysfs.c  2
-rw-r--r--  kernel/lockdep.c  10
-rw-r--r--  kernel/nsproxy.c  13
-rw-r--r--  kernel/params.c  6
-rw-r--r--  kernel/perf_event.c  118
-rw-r--r--  kernel/pid.c  4
-rw-r--r--  kernel/pid_namespace.c  7
-rw-r--r--  kernel/rcutree.h  21
-rw-r--r--  kernel/rcutree_plugin.h  8
-rw-r--r--  kernel/sched.c  4
-rw-r--r--  kernel/sched_cpupri.c  2
-rw-r--r--  kernel/sched_fair.c  2
-rw-r--r--  kernel/sched_rt.c  7
-rw-r--r--  kernel/sys.c  67
-rw-r--r--  kernel/sys_ni.c  1
-rw-r--r--  kernel/sysctl.c  37
-rw-r--r--  kernel/time/clocksource.c  4
-rw-r--r--  kernel/trace/Makefile  2
-rw-r--r--  kernel/trace/ftrace.c  30
-rw-r--r--  kernel/trace/ring_buffer.c  16
-rw-r--r--  kernel/trace/trace.c  49
-rw-r--r--  kernel/trace/trace.h  5
-rw-r--r--  kernel/trace/trace_clock.c  1
-rw-r--r--  kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c)  54
-rw-r--r--  kernel/trace/trace_events.c  2
-rw-r--r--  kernel/trace/trace_functions_graph.c  27
-rw-r--r--  kernel/trace/trace_kprobe.c  29
-rw-r--r--  kernel/trace/trace_syscalls.c  72
37 files changed, 1027 insertions, 347 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
}
/**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
- memset(pacct, 0, sizeof(struct pacct_struct));
- pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-
-/**
* acct_collect - collect accounting information into pacct_struct
* @exitcode: task exit code
* @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..78f7f86aa238 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
skb_get(skb);
err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+ BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
audit_log_lost("auditd dissapeared\n");
audit_pid = 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..ef909a329750 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
@@ -44,6 +48,7 @@
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
+#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
@@ -52,15 +57,21 @@
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
#include <asm/atomic.h>
static DEFINE_MUTEX(cgroup_mutex);
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
#define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
@@ -147,6 +158,35 @@ struct css_id {
unsigned short stack[0]; /* Array of Length (depth+1) */
};
+/*
+ * cgroup_event represents events which userspace wants to receive.
+ */
+struct cgroup_event {
+ /*
+ * Cgroup which the event belongs to.
+ */
+ struct cgroup *cgrp;
+ /*
+ * Control file with which the event is associated.
+ */
+ struct cftype *cft;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * All fields below are needed to unregister the event when
+ * userspace closes the eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
/* The list of hierarchy roots */
@@ -250,7 +290,8 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *css);
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +489,11 @@ static struct css_set *find_existing_css_set(
struct hlist_node *node;
struct css_set *cg;
- /* Built the set of subsystem state objects that we want to
- * see in the new css_set */
+ /*
+ * Build the set of subsystem state objects that we want to see in the
+ * new css_set. while subsystems can change globally, the entries here
+ * won't change, so no need for locking.
+ */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
if (root->subsys_bits & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
@@ -696,6 +740,7 @@ void cgroup_lock(void)
{
mutex_lock(&cgroup_mutex);
}
+EXPORT_SYMBOL_GPL(cgroup_lock);
/**
* cgroup_unlock - release lock on cgroup changes
@@ -706,6 +751,7 @@ void cgroup_unlock(void)
{
mutex_unlock(&cgroup_mutex);
}
+EXPORT_SYMBOL_GPL(cgroup_unlock);
/*
* A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
if (ret)
break;
}
+
return ret;
}
@@ -884,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
css_put(css);
}
-
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -892,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
struct cgroup *cgrp = &root->top_cgroup;
int i;
+ BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
/* Check that any added subsystems are currently free */
@@ -900,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
struct cgroup_subsys *ss = subsys[i];
if (!(bit & added_bits))
continue;
+ /*
+ * Nobody should tell us to do a subsys that doesn't exist:
+ * parse_cgroupfs_options should catch that case and refcounts
+ * ensure that subsystems won't disappear once selected.
+ */
+ BUG_ON(ss == NULL);
if (ss->root != &rootnode) {
/* Subsystem isn't free */
return -EBUSY;
@@ -919,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long bit = 1UL << i;
if (bit & added_bits) {
/* We're binding this subsystem to this hierarchy */
+ BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
if (ss->bind)
ss->bind(ss, cgrp);
mutex_unlock(&ss->hierarchy_mutex);
+ /* refcount was already taken, and we're keeping it */
} else if (bit & removed_bits) {
/* We're removing this subsystem */
+ BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
subsys[i]->root = &rootnode;
list_move(&ss->sibling, &rootnode.subsys_list);
mutex_unlock(&ss->hierarchy_mutex);
+ /* subsystem is now free - drop reference on module */
+ module_put(ss->module);
} else if (bit & final_bits) {
/* Subsystem state should already exist */
+ BUG_ON(ss == NULL);
BUG_ON(!cgrp->subsys[i]);
+ /*
+ * a refcount was taken, but we already had one, so
+ * drop the extra reference.
+ */
+ module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+ BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
} else {
/* Subsystem state shouldn't exist */
BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1059,20 @@ struct cgroup_sb_opts {
};
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
- struct cgroup_sb_opts *opts)
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data ?: "all";
unsigned long mask = (unsigned long)-1;
+ int i;
+ bool module_pin_failed = false;
+
+ BUG_ON(!mutex_is_locked(&cgroup_mutex));
#ifdef CONFIG_CPUSETS
mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1085,11 @@ static int parse_cgroupfs_options(char *data,
return -EINVAL;
if (!strcmp(token, "all")) {
/* Add all non-disabled subsystems */
- int i;
opts->subsys_bits = 0;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (!ss->disabled)
opts->subsys_bits |= 1ul << i;
}
@@ -1026,7 +1107,6 @@ static int parse_cgroupfs_options(char *data,
if (!opts->release_agent)
return -ENOMEM;
} else if (!strncmp(token, "name=", 5)) {
- int i;
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
@@ -1050,9 +1130,10 @@ static int parse_cgroupfs_options(char *data,
return -ENOMEM;
} else {
struct cgroup_subsys *ss;
- int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (!strcmp(token, ss->name)) {
if (!ss->disabled)
set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1168,54 @@ static int parse_cgroupfs_options(char *data,
if (!opts->subsys_bits && !opts->name)
return -EINVAL;
+ /*
+ * Grab references on all the modules we'll need, so the subsystems
+ * don't dance around before rebind_subsystems attaches them. This may
+ * take duplicate reference counts on a subsystem that's already used,
+ * but rebind_subsystems handles this case.
+ */
+ for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ unsigned long bit = 1UL << i;
+
+ if (!(bit & opts->subsys_bits))
+ continue;
+ if (!try_module_get(subsys[i]->module)) {
+ module_pin_failed = true;
+ break;
+ }
+ }
+ if (module_pin_failed) {
+ /*
+ * oops, one of the modules was going away. this means that we
+ * raced with a module_delete call, and to the user this is
+ * essentially a "subsystem doesn't exist" case.
+ */
+ for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+ /* drop refcounts only on the ones we took */
+ unsigned long bit = 1UL << i;
+
+ if (!(bit & opts->subsys_bits))
+ continue;
+ module_put(subsys[i]->module);
+ }
+ return -ENOENT;
+ }
+
return 0;
}
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+ int i;
+ for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ unsigned long bit = 1UL << i;
+
+ if (!(bit & subsys_bits))
+ continue;
+ module_put(subsys[i]->module);
+ }
+}
+
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
int ret = 0;
@@ -1106,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
- /* Don't allow flags to change at remount */
- if (opts.flags != root->flags) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* Don't allow name to change at remount */
- if (opts.name && strcmp(opts.name, root->name)) {
+ /* Don't allow flags or name to change at remount */
+ if (opts.flags != root->flags ||
+ (opts.name && strcmp(opts.name, root->name))) {
ret = -EINVAL;
+ drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock;
}
ret = rebind_subsystems(root, opts.subsys_bits);
- if (ret)
+ if (ret) {
+ drop_parsed_module_refcounts(opts.subsys_bits);
goto out_unlock;
+ }
/* (re)populate subsystem files */
cgroup_populate_dir(cgrp);
@@ -1151,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
+ INIT_LIST_HEAD(&cgrp->event_list);
+ spin_lock_init(&cgrp->event_list_lock);
}
static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct cgroupfs_root *new_root;
/* First find the desired set of subsystems */
+ mutex_lock(&cgroup_mutex);
ret = parse_cgroupfs_options(data, &opts);
+ mutex_unlock(&cgroup_mutex);
if (ret)
goto out_err;
@@ -1317,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
new_root = cgroup_root_from_opts(&opts);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
- goto out_err;
+ goto drop_modules;
}
opts.new_root = new_root;
@@ -1326,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
cgroup_drop_root(opts.new_root);
- goto out_err;
+ goto drop_modules;
}
root = sb->s_fs_info;
@@ -1382,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
free_cg_links(&tmp_cg_links);
goto drop_new_super;
}
+ /*
+ * There must be no failure case after here, since rebinding
+ * takes care of subsystems' refcounts, which are explicitly
+ * dropped in the failure exit path.
+ */
/* EBUSY should be the only error here */
BUG_ON(ret);
@@ -1420,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
* any) is not needed
*/
cgroup_drop_root(opts.new_root);
+ /* no subsys rebinding, so refcounts don't change */
+ drop_parsed_module_refcounts(opts.subsys_bits);
}
simple_set_mnt(mnt, sb);
@@ -1429,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
drop_new_super:
deactivate_locked_super(sb);
+ drop_modules:
+ drop_parsed_module_refcounts(opts.subsys_bits);
out_err:
kfree(opts.release_agent);
kfree(opts.name);
@@ -1542,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
memmove(buf, start, buf + buflen - start);
return 0;
}
+EXPORT_SYMBOL_GPL(cgroup_path);
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval = 0;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
struct css_set *cg;
struct css_set *newcg;
@@ -1568,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, tsk, false);
- if (retval)
- return retval;
+ if (retval) {
+ /*
+ * Remember on which subsystem the can_attach()
+ * failed, so that we only call cancel_attach()
+ * against the subsystems whose can_attach()
+ * succeeded. (See below)
+ */
+ failed_ss = ss;
+ goto out;
+ }
}
}
@@ -1583,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
*/
newcg = find_css_set(cg, cgrp);
put_css_set(cg);
- if (!newcg)
- return -ENOMEM;
+ if (!newcg) {
+ retval = -ENOMEM;
+ goto out;
+ }
task_lock(tsk);
if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
put_css_set(newcg);
- return -ESRCH;
+ retval = -ESRCH;
+ goto out;
}
rcu_assign_pointer(tsk->cgroups, newcg);
task_unlock(tsk);
@@ -1616,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiter(cgrp);
- return 0;
+out:
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ /*
+ * This subsystem was the one that failed the
+ * can_attach() check earlier, so we don't need
+ * to call cancel_attach() against it or any
+ * remaining subsystems.
+ */
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, tsk, false);
+ }
+ }
+ return retval;
}
/*
@@ -1682,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
}
return true;
}
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
@@ -1950,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
+/*
+ * Check if a file is a control file
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+ if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ return ERR_PTR(-EINVAL);
+ return __d_cft(file->f_dentry);
+}
+
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
@@ -2069,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp,
error = PTR_ERR(dentry);
return error;
}
+EXPORT_SYMBOL_GPL(cgroup_add_file);
int cgroup_add_files(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
@@ -2083,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp,
}
return 0;
}
+EXPORT_SYMBOL_GPL(cgroup_add_files);
/**
* cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+ struct pid_namespace *ns = current->nsproxy->pid_ns;
+
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2656,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
- /* found a matching list - drop the extra refcount */
- put_pid_ns(ns);
/* make sure l doesn't vanish out from under us */
down_write(&l->mutex);
mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
- put_pid_ns(ns);
return l;
}
init_rwsem(&l->mutex);
down_write(&l->mutex);
l->key.type = type;
- l->key.ns = ns;
+ l->key.ns = get_pid_ns(ns);
l->use_count = 0; /* don't increment here */
l->list = NULL;
l->owner = cgrp;
@@ -2804,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
}
/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+ struct cgroup_event *event = container_of(work, struct cgroup_event,
+ remove);
+ struct cgroup *cgrp = event->cgrp;
+
+ /* TODO: check return code */
+ event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ dput(cgrp->dentry);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct cgroup_event *event = container_of(wait,
+ struct cgroup_event, wait);
+ struct cgroup *cgrp = event->cgrp;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ remove_wait_queue_locked(event->wqh, &event->wait);
+ spin_lock(&cgrp->event_list_lock);
+ list_del(&event->list);
+ spin_unlock(&cgrp->event_list_lock);
+ /*
+ * We are in atomic context, but cgroup_event_remove() may
+ * sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+
+ return 0;
+}
+
+static void cgroup_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct cgroup_event *event = container_of(pt,
+ struct cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ struct cgroup_event *event = NULL;
+ unsigned int efd, cfd;
+ struct file *efile = NULL;
+ struct file *cfile = NULL;
+ char *endp;
+ int ret;
+
+ efd = simple_strtoul(buffer, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buffer = endp + 1;
+
+ cfd = simple_strtoul(buffer, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buffer = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+ event->cgrp = cgrp;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+ INIT_WORK(&event->remove, cgroup_event_remove);
+
+ efile = eventfd_fget(efd);
+ if (IS_ERR(efile)) {
+ ret = PTR_ERR(efile);
+ goto fail;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto fail;
+ }
+
+ cfile = fget(cfd);
+ if (!cfile) {
+ ret = -EBADF;
+ goto fail;
+ }
+
+ /* the process needs read permission on the control file */
+ ret = file_permission(cfile, MAY_READ);
+ if (ret < 0)
+ goto fail;
+
+ event->cft = __file_cft(cfile);
+ if (IS_ERR(event->cft)) {
+ ret = PTR_ERR(event->cft);
+ goto fail;
+ }
+
+ if (!event->cft->register_event || !event->cft->unregister_event) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = event->cft->register_event(cgrp, event->cft,
+ event->eventfd, buffer);
+ if (ret)
+ goto fail;
+
+ if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+ event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ ret = 0;
+ goto fail;
+ }
+
+ /*
+ * Events should be removed after rmdir of cgroup directory, but before
+ * destroying subsystem state objects. Let's take reference to cgroup
+ * directory dentry to do that.
+ */
+ dget(cgrp->dentry);
+
+ spin_lock(&cgrp->event_list_lock);
+ list_add(&event->list, &cgrp->event_list);
+ spin_unlock(&cgrp->event_list_lock);
+
+ fput(cfile);
+ fput(efile);
+
+ return 0;
+
+fail:
+ if (cfile)
+ fput(cfile);
+
+ if (event && event->eventfd && !IS_ERR(event->eventfd))
+ eventfd_ctx_put(event->eventfd);
+
+ if (!IS_ERR_OR_NULL(efile))
+ fput(efile);
+
+ kfree(event);
+
+ return ret;
+}
+
+/*
* for the common functions, 'private' gives the type of file
*/
/* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3171,11 @@ static struct cftype files[] = {
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
+ {
+ .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+ .write_string = cgroup_write_event_control,
+ .mode = S_IWUGO,
+ },
};
static struct cftype cft_release_agent = {
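The cgroup_write_event_control() hunk above documents the interface: userspace writes "<event_fd> <control_fd> <args>" into the new cgroup.event_control file and is then notified through the eventfd. As a rough illustration only (not part of this patch), a userspace registration could look like the sketch below; the mount point, the control file name, and the assumption that its cftype implements register_event()/unregister_event() are all hypothetical.

/* Hypothetical userspace sketch: register an eventfd notification on a
 * cgroup control file via cgroup.event_control. Paths are assumptions. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t count;
	int efd = eventfd(0, 0);                                 /* event_fd */
	int cfd = open("/cgroup/foo/some.threshold", O_RDONLY); /* control_fd */
	int ecfd = open("/cgroup/foo/cgroup.event_control", O_WRONLY);

	if (efd < 0 || cfd < 0 || ecfd < 0) {
		perror("setup");
		return 1;
	}

	/* Format parsed by cgroup_write_event_control(): "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* Blocks until the kernel calls eventfd_signal(), e.g. on cgroup removal. */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("received %llu event(s)\n", (unsigned long long)count);
	return 0;
}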
@@ -2892,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
/* We need to take each hierarchy_mutex in a consistent order */
int i;
+ /*
+ * No worry about a race with rebind_subsystems that might mess up the
+ * locking order, since both parties are under cgroup_mutex.
+ */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (ss->root == root)
mutex_lock(&ss->hierarchy_mutex);
}
@@ -2905,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (ss->root == root)
mutex_unlock(&ss->hierarchy_mutex);
}
@@ -3028,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* synchronization other than RCU, and the subsystem linked
* list isn't RCU-safe */
int i;
+ /*
+ * We won't need to lock the subsys array, because the subsystems
+ * we're concerned about aren't going anywhere since our cgroup root
+ * has a reference on them.
+ */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
- /* Skip subsystems not in this hierarchy */
- if (ss->root != cgrp->root)
+ /* Skip subsystems not present or not in this hierarchy */
+ if (ss == NULL || ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/* When called from check_for_release() it's possible
@@ -3106,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct dentry *d;
struct cgroup *parent;
DEFINE_WAIT(wait);
+ struct cgroup_event *event, *tmp;
int ret;
/* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3551,20 @@ again:
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace
+ */
+ spin_lock(&cgrp->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+ list_del(&event->list);
+ remove_wait_queue(event->wqh, &event->wait);
+ eventfd_signal(event->eventfd, 1);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&cgrp->event_list_lock);
+
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -3223,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
mutex_init(&ss->hierarchy_mutex);
lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
+
+ /* this function shouldn't be used with modular subsystems, since they
+ * need to register a subsys_id, among other things */
+ BUG_ON(ss->module);
+}
+
+/**
+ * cgroup_load_subsys: load and register a modular subsystem at runtime
+ * @ss: the subsystem to load
+ *
+ * This function should be called in a modular subsystem's initcall. If the
+ * subsytem is built as a module, it will be assigned a new subsys_id and set
+ * up for use. If the subsystem is built-in anyway, work is delegated to the
+ * simpler cgroup_init_subsys.
+ */
+int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+{
+ int i;
+ struct cgroup_subsys_state *css;
+
+ /* check name and function validity */
+ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
+ ss->create == NULL || ss->destroy == NULL)
+ return -EINVAL;
+
+ /*
+ * we don't support callbacks in modular subsystems. this check is
+ * before the ss->module check for consistency; a subsystem that could
+ * be a module should still have no callbacks even if the user isn't
+ * compiling it as one.
+ */
+ if (ss->fork || ss->exit)
+ return -EINVAL;
+
+ /*
+ * an optionally modular subsystem is built-in: we want to do nothing,
+ * since cgroup_init_subsys will have already taken care of it.
+ */
+ if (ss->module == NULL) {
+ /* a few sanity checks */
+ BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+ BUG_ON(subsys[ss->subsys_id] != ss);
+ return 0;
+ }
+
+ /*
+ * need to register a subsys id before anything else - for example,
+ * init_cgroup_css needs it.
+ */
+ mutex_lock(&cgroup_mutex);
+ /* find the first empty slot in the array */
+ for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ if (subsys[i] == NULL)
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT) {
+ /* maximum number of subsystems already registered! */
+ mutex_unlock(&cgroup_mutex);
+ return -EBUSY;
+ }
+ /* assign ourselves the subsys_id */
+ ss->subsys_id = i;
+ subsys[i] = ss;
+
+ /*
+ * no ss->create seems to need anything important in the ss struct, so
+ * this can happen first (i.e. before the rootnode attachment).
+ */
+ css = ss->create(ss, dummytop);
+ if (IS_ERR(css)) {
+ /* failure case - need to deassign the subsys[] slot. */
+ subsys[i] = NULL;
+ mutex_unlock(&cgroup_mutex);
+ return PTR_ERR(css);
+ }
+
+ list_add(&ss->sibling, &rootnode.subsys_list);
+ ss->root = &rootnode;
+
+ /* our new subsystem will be attached to the dummy hierarchy. */
+ init_cgroup_css(css, ss, dummytop);
+ /* init_idr must be after init_cgroup_css because it sets css->id. */
+ if (ss->use_id) {
+ int ret = cgroup_init_idr(ss, css);
+ if (ret) {
+ dummytop->subsys[ss->subsys_id] = NULL;
+ ss->destroy(ss, dummytop);
+ subsys[i] = NULL;
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+ }
+ }
+
+ /*
+ * Now we need to entangle the css into the existing css_sets. unlike
+ * in cgroup_init_subsys, there are now multiple css_sets, so each one
+ * will need a new pointer to it; done by iterating the css_set_table.
+ * furthermore, modifying the existing css_sets will corrupt the hash
+ * table state, so each changed css_set will need its hash recomputed.
+ * this is all done under the css_set_lock.
+ */
+ write_lock(&css_set_lock);
+ for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+ struct css_set *cg;
+ struct hlist_node *node, *tmp;
+ struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+
+ hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+ /* skip entries that we already rehashed */
+ if (cg->subsys[ss->subsys_id])
+ continue;
+ /* remove existing entry */
+ hlist_del(&cg->hlist);
+ /* set new value */
+ cg->subsys[ss->subsys_id] = css;
+ /* recompute hash and restore entry */
+ new_bucket = css_set_hash(cg->subsys);
+ hlist_add_head(&cg->hlist, new_bucket);
+ }
+ }
+ write_unlock(&css_set_lock);
+
+ mutex_init(&ss->hierarchy_mutex);
+ lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
+ ss->active = 1;
+
+ /* success! */
+ mutex_unlock(&cgroup_mutex);
+ return 0;
}
+EXPORT_SYMBOL_GPL(cgroup_load_subsys);
+
+/**
+ * cgroup_unload_subsys: unload a modular subsystem
+ * @ss: the subsystem to unload
+ *
+ * This function should be called in a modular subsystem's exitcall. When this
+ * function is invoked, the refcount on the subsystem's module will be 0, so
+ * the subsystem will not be attached to any hierarchy.
+ */
+void cgroup_unload_subsys(struct cgroup_subsys *ss)
+{
+ struct cg_cgroup_link *link;
+ struct hlist_head *hhead;
+
+ BUG_ON(ss->module == NULL);
+
+ /*
+ * we shouldn't be called if the subsystem is in use, and the use of
+ * try_module_get in parse_cgroupfs_options should ensure that it
+ * doesn't start being used while we're killing it off.
+ */
+ BUG_ON(ss->root != &rootnode);
+
+ mutex_lock(&cgroup_mutex);
+ /* deassign the subsys_id */
+ BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
+ subsys[ss->subsys_id] = NULL;
+
+ /* remove subsystem from rootnode's list of subsystems */
+ list_del(&ss->sibling);
+
+ /*
+ * disentangle the css from all css_sets attached to the dummytop. as
+ * in loading, we need to pay our respects to the hashtable gods.
+ */
+ write_lock(&css_set_lock);
+ list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
+ struct css_set *cg = link->cg;
+
+ hlist_del(&cg->hlist);
+ BUG_ON(!cg->subsys[ss->subsys_id]);
+ cg->subsys[ss->subsys_id] = NULL;
+ hhead = css_set_hash(cg->subsys);
+ hlist_add_head(&cg->hlist, hhead);
+ }
+ write_unlock(&css_set_lock);
+
+ /*
+ * remove subsystem's css from the dummytop and free it - need to free
+ * before marking as null because ss->destroy needs the cgrp->subsys
+ * pointer to find their state. note that this also takes care of
+ * freeing the css_id.
+ */
+ ss->destroy(ss, dummytop);
+ dummytop->subsys[ss->subsys_id] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
/**
* cgroup_init_early - cgroup initialization at system boot
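cgroup_load_subsys() and cgroup_unload_subsys() above are the whole modular-subsystem API: the module supplies name/create/destroy plus .module, must not use fork/exit callbacks, and has its subsys_id assigned at load time. A minimal, purely hypothetical "foo" subsystem built against that API might look like the sketch below (illustration only, not part of the patch; the subsystem name and state struct are invented).

/* Hypothetical modular cgroup subsystem skeleton (illustration only). */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct foo_state {
	struct cgroup_subsys_state css;
	/* per-cgroup data for this hypothetical subsystem would go here */
};

static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
					      struct cgroup *cgrp)
{
	struct foo_state *fs = kzalloc(sizeof(*fs), GFP_KERNEL);

	if (!fs)
		return ERR_PTR(-ENOMEM);
	return &fs->css;	/* cgroup core initializes the css itself */
}

static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* subsys_id was assigned dynamically by cgroup_load_subsys() */
	kfree(container_of(cgroup_subsys_state(cgrp, ss->subsys_id),
			   struct foo_state, css));
}

static struct cgroup_subsys foo_subsys = {
	.name		= "foo",
	.create		= foo_create,
	.destroy	= foo_destroy,
	/* no fork/exit callbacks: they are rejected for modular subsystems */
	.module		= THIS_MODULE,
};

static int __init foo_cgroup_init(void)
{
	return cgroup_load_subsys(&foo_subsys);
}

static void __exit foo_cgroup_exit(void)
{
	cgroup_unload_subsys(&foo_subsys);
}

module_init(foo_cgroup_init);
module_exit(foo_cgroup_exit);
MODULE_LICENSE("GPL");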
@@ -3253,7 +3818,8 @@ int __init cgroup_init_early(void)
for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&css_set_table[i]);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* at bootup time, we don't worry about modular subsystems */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
BUG_ON(!ss->name);
@@ -3288,12 +3854,13 @@ int __init cgroup_init(void)
if (err)
return err;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* at bootup time, we don't worry about modular subsystems */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
if (ss->use_id)
- cgroup_subsys_init_idr(ss);
+ cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
}
/* Add init_css_set to the hash table */
@@ -3397,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
{
if (need_forkexit_callback) {
int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * forkexit callbacks are only supported for builtin
+ * subsystems, and the builtin section of the subsys array is
+ * immutable, so we don't need to lock the subsys array here.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->fork)
ss->fork(ss, child);
@@ -3526,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
struct css_set *cg;
if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->exit)
ss->exit(ss, tsk);
@@ -3720,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp)
}
}
-void __css_put(struct cgroup_subsys_state *css)
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css, int count)
{
struct cgroup *cgrp = css->cgroup;
int val;
rcu_read_lock();
- val = atomic_dec_return(&css->refcnt);
+ val = atomic_sub_return(count, &css->refcnt);
if (val == 1) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css)
rcu_read_unlock();
WARN_ON_ONCE(val < 1);
}
+EXPORT_SYMBOL_GPL(__css_put);
/*
* Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4402,11 @@ static int __init cgroup_disable(char *str)
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * cgroup_disable, being at boot time, can't know about module
+ * subsystems, so we don't worry about them.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (!strcmp(token, ss->name)) {
@@ -3848,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
return cssid->id;
return 0;
}
+EXPORT_SYMBOL_GPL(css_id);
unsigned short css_depth(struct cgroup_subsys_state *css)
{
@@ -3857,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
return cssid->depth;
return 0;
}
+EXPORT_SYMBOL_GPL(css_depth);
bool css_is_ancestor(struct cgroup_subsys_state *child,
const struct cgroup_subsys_state *root)
@@ -3893,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
spin_unlock(&ss->id_lock);
call_rcu(&id->rcu_head, __free_css_id_cb);
}
+EXPORT_SYMBOL_GPL(free_css_id);
/*
* This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4533,14 @@ err_out:
}
-static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *rootcss)
{
struct css_id *newid;
- struct cgroup_subsys_state *rootcss;
spin_lock_init(&ss->id_lock);
idr_init(&ss->idr);
- rootcss = init_css_set.subsys[ss->subsys_id];
newid = get_new_cssid(ss, 0);
if (IS_ERR(newid))
return PTR_ERR(newid);
@@ -4010,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
return rcu_dereference(cssid->css);
}
+EXPORT_SYMBOL_GPL(css_lookup);
/**
* css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/exit.c b/kernel/exit.c
index ce1e48c2d93d..cce59cb5ee6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk)
sighand = rcu_dereference_check(tsk->sighand,
rcu_read_lock_held() ||
- lockdep_is_held(&tasklist_lock));
+ lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b0ec34abc0bb..4799c5f0e6d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
-EXPORT_SYMBOL_GPL(tasklist_lock);
+
+#ifdef CONFIG_PROVE_RCU
+int lockdep_tasklist_lock_is_held(void)
+{
+ return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
int nr_processes(void)
{
@@ -833,17 +840,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
/* Thread group counters. */
thread_group_cputime_init(sig);
- /* Expiration times and increments. */
- sig->it[CPUCLOCK_PROF].expires = cputime_zero;
- sig->it[CPUCLOCK_PROF].incr = cputime_zero;
- sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
- sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
-
- /* Cached expiration times. */
- sig->cputime_expires.prof_exp = cputime_zero;
- sig->cputime_expires.virt_exp = cputime_zero;
- sig->cputime_expires.sched_exp = 0;
-
cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
@@ -863,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_THREAD)
return 0;
- sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+ sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
if (!sig)
return -ENOMEM;
@@ -871,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->count, 1);
atomic_set(&sig->live, 1);
init_waitqueue_head(&sig->wait_chldexit);
- sig->flags = 0;
if (clone_flags & CLONE_NEWPID)
sig->flags |= SIGNAL_UNKILLABLE;
- sig->group_exit_code = 0;
- sig->group_exit_task = NULL;
- sig->group_stop_count = 0;
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
- sig->leader = 0; /* session leadership doesn't inherit */
- sig->tty_old_pgrp = NULL;
- sig->tty = NULL;
-
- sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
- sig->gtime = cputime_zero;
- sig->cgtime = cputime_zero;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- sig->prev_utime = sig->prev_stime = cputime_zero;
-#endif
- sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
- sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
- sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
- sig->maxrss = sig->cmaxrss = 0;
- task_io_accounting_init(&sig->ioac);
- sig->sum_sched_runtime = 0;
- taskstats_tgid_init(sig);
-
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
posix_cpu_timers_init_group(sig);
- acct_init_pacct(&sig->pacct);
-
tty_audit_fork(sig);
sig->oom_adj = current->signal->oom_adj;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 967e66143e11..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
*
* @return a set of per_cpu pointers to perf events
*/
-struct perf_event **
+struct perf_event * __percpu *
register_wide_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered)
{
- struct perf_event **cpu_events, **pevent, *bp;
+ struct perf_event * __percpu *cpu_events, **pevent, *bp;
long err;
int cpu;
cpu_events = alloc_percpu(typeof(*cpu_events));
if (!cpu_events)
- return ERR_PTR(-ENOMEM);
+ return (void __percpu __force *)ERR_PTR(-ENOMEM);
get_online_cpus();
for_each_online_cpu(cpu) {
@@ -451,7 +451,7 @@ fail:
put_online_cpus();
free_percpu(cpu_events);
- return ERR_PTR(err);
+ return (void __percpu __force *)ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
* unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
* @cpu_events: the per cpu set of events to unregister
*/
-void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
{
int cpu;
struct perf_event **pevent;
@@ -489,5 +489,4 @@ struct pmu perf_ops_bp = {
.enable = arch_install_hw_breakpoint,
.disable = arch_uninstall_hw_breakpoint,
.read = hw_breakpoint_pmu_read,
- .unthrottle = hw_breakpoint_pmu_unthrottle
};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d70394f12ee9..42ec11b2af8a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -554,7 +554,7 @@ out:
* signal. The occurence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
- * is handled by the assosiacted event handler. If this happens it
+ * is handled by the associated event handler. If this happens it
* might be necessary to disable (mask) the interrupt depending on the
* controller hardware. This requires to reenable the interrupt inside
* of the loop which handles the interrupts which have arrived while
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
- * separately, dev_free_irq() must be used.
+ * separately, devm_free_irq() must be used.
*/
int devm_request_threaded_irq(struct device *dev, unsigned int irq,
irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as free_irq().
* This function instead of free_irq() should be used to manually
- * free IRQs allocated with dev_request_irq().
+ * free IRQs allocated with devm_request_irq().
*/
void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fa034d29cf73..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -259,7 +259,8 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
struct kprobe_insn_page *kip;
list_for_each_entry(kip, &c->pages, list) {
- long idx = ((long)slot - (long)kip->insns) / c->insn_size;
+ long idx = ((long)slot - (long)kip->insns) /
+ (c->insn_size * sizeof(kprobe_opcode_t));
if (idx >= 0 && idx < slots_per_page(c)) {
WARN_ON(kip->slot_used[idx] != SLOT_USED);
if (dirty) {
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6b1ccc3f0205..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
-/* uevent helper program, used during early boo */
+/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de1..c927a549db2c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
- trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
-
if (unlikely(current->lockdep_recursion))
return;
@@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
check_flags(flags);
current->lockdep_recursion = 1;
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), nest_lock, ip, 0);
current->lockdep_recursion = 0;
@@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested,
{
unsigned long flags;
- trace_lock_release(lock, nested, ip);
-
if (unlikely(current->lockdep_recursion))
return;
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
+ trace_lock_release(lock, nested, ip);
__lock_release(lock, nested, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- trace_lock_contended(lock, ip);
-
if (unlikely(!lock_stat))
return;
@@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
+ trace_lock_contended(lock, ip);
__lock_contended(lock, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -3822,6 +3819,7 @@ void lockdep_rcu_dereference(const char *file, const int line)
printk("%s:%d invoked rcu_dereference_check() without protection!\n",
file, line);
printk("\nother info that might help us debug this:\n\n");
+ printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
dump_stack();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..2ab67233ee8f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -24,7 +24,18 @@
static struct kmem_cache *nsproxy_cachep;
-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+struct nsproxy init_nsproxy = {
+ .count = ATOMIC_INIT(1),
+ .uts_ns = &init_uts_ns,
+#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
+ .ipc_ns = &init_ipc_ns,
+#endif
+ .mnt_ns = NULL,
+ .pid_ns = &init_pid_ns,
+#ifdef CONFIG_NET
+ .net_ns = &init_net,
+#endif
+};
static inline struct nsproxy *create_nsproxy(void)
{
diff --git a/kernel/params.c b/kernel/params.c
index d55a53ec9234..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -401,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
}
/* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
extern struct kernel_param __start___param[], __stop___param[];
@@ -420,7 +420,7 @@ struct module_param_attrs
};
#ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
static ssize_t param_attr_show(struct module_attribute *mattr,
struct module *mod, char *buf)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f40560b86544..574ee58a3046 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
- return sysctl_perf_event_paranoid > -1;
-}
-
-static inline bool perf_paranoid_cpu(void)
-{
- return sysctl_perf_event_paranoid > 0;
-}
-
-static inline bool perf_paranoid_kernel(void)
-{
- return sysctl_perf_event_paranoid > 1;
-}
-
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
/*
@@ -96,10 +81,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
void __weak hw_perf_disable(void) { barrier(); }
void __weak hw_perf_enable(void) { barrier(); }
-void __weak hw_perf_event_setup(int cpu) { barrier(); }
-void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
-void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
-
int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
struct perf_cpu_context *cpuctx,
@@ -112,25 +93,15 @@ void __weak perf_event_print_debug(void) { }
static DEFINE_PER_CPU(int, perf_disable_count);
-void __perf_disable(void)
-{
- __get_cpu_var(perf_disable_count)++;
-}
-
-bool __perf_enable(void)
-{
- return !--__get_cpu_var(perf_disable_count);
-}
-
void perf_disable(void)
{
- __perf_disable();
- hw_perf_disable();
+ if (!__get_cpu_var(perf_disable_count)++)
+ hw_perf_disable();
}
void perf_enable(void)
{
- if (__perf_enable())
+ if (!--__get_cpu_var(perf_disable_count))
hw_perf_enable();
}
@@ -1553,12 +1524,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
*/
if (interrupts == MAX_INTERRUPTS) {
perf_log_throttle(event, 1);
+ perf_disable();
event->pmu->unthrottle(event);
+ perf_enable();
}
if (!event->attr.freq || !event->attr.sample_freq)
continue;
+ perf_disable();
event->pmu->read(event);
now = atomic64_read(&event->count);
delta = now - hwc->freq_count_stamp;
@@ -1566,6 +1540,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
if (delta > 0)
perf_adjust_period(event, TICK_NSEC, delta);
+ perf_enable();
}
raw_spin_unlock(&ctx->lock);
}
@@ -1575,9 +1550,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
*/
static void rotate_ctx(struct perf_event_context *ctx)
{
- if (!ctx->nr_events)
- return;
-
raw_spin_lock(&ctx->lock);
/* Rotate the first entry last of non-pinned groups */
@@ -1590,19 +1562,28 @@ void perf_event_task_tick(struct task_struct *curr)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
+ int rotate = 0;
if (!atomic_read(&nr_events))
return;
cpuctx = &__get_cpu_var(perf_cpu_context);
- ctx = curr->perf_event_ctxp;
+ if (cpuctx->ctx.nr_events &&
+ cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+ rotate = 1;
- perf_disable();
+ ctx = curr->perf_event_ctxp;
+ if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+ rotate = 1;
perf_ctx_adjust_freq(&cpuctx->ctx);
if (ctx)
perf_ctx_adjust_freq(ctx);
+ if (!rotate)
+ return;
+
+ perf_disable();
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1614,7 +1595,6 @@ void perf_event_task_tick(struct task_struct *curr)
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
if (ctx)
task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-
perf_enable();
}
@@ -2806,6 +2786,13 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return NULL;
}
+#ifdef CONFIG_EVENT_TRACING
+__weak
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+{
+}
+#endif
+
/*
* Output
*/
@@ -4123,8 +4110,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
if (rctx < 0)
return;
- data.addr = addr;
- data.raw = NULL;
+ perf_sample_data_init(&data, addr);
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
@@ -4169,11 +4155,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
struct perf_event *event;
u64 period;
- event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+ event = container_of(hrtimer, struct perf_event, hw.hrtimer);
event->pmu->read(event);
- data.addr = 0;
- data.raw = NULL;
+ perf_sample_data_init(&data, 0);
data.period = event->hw.last_period;
regs = get_irq_regs();
/*
@@ -4335,26 +4320,20 @@ static const struct pmu perf_ops_task_clock = {
#ifdef CONFIG_EVENT_TRACING
void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
- int entry_size)
+ int entry_size, struct pt_regs *regs)
{
+ struct perf_sample_data data;
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
};
- struct perf_sample_data data = {
- .addr = addr,
- .raw = &raw,
- };
-
- struct pt_regs *regs = get_irq_regs();
-
- if (!regs)
- regs = task_pt_regs(current);
+ perf_sample_data_init(&data, addr);
+ data.raw = &raw;
/* Trace events already protected against recursion */
do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
- &data, regs);
+ &data, regs);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -4370,7 +4349,7 @@ static int perf_tp_event_match(struct perf_event *event,
static void tp_perf_event_destroy(struct perf_event *event)
{
- ftrace_profile_disable(event->attr.config);
+ perf_trace_disable(event->attr.config);
}
static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4384,7 +4363,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- if (ftrace_profile_enable(event->attr.config))
+ if (perf_trace_enable(event->attr.config))
return NULL;
event->destroy = tp_perf_event_destroy;
@@ -4463,8 +4442,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
struct perf_sample_data sample;
struct pt_regs *regs = data;
- sample.raw = NULL;
- sample.addr = bp->attr.bp_addr;
+ perf_sample_data_init(&sample, bp->attr.bp_addr);
if (!perf_exclude_event(bp, regs))
perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -5392,18 +5370,26 @@ int perf_event_init_task(struct task_struct *child)
return ret;
}
+static void __init perf_event_init_all_cpus(void)
+{
+ int cpu;
+ struct perf_cpu_context *cpuctx;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = &per_cpu(perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx, NULL);
+ }
+}
+
static void __cpuinit perf_event_init_cpu(int cpu)
{
struct perf_cpu_context *cpuctx;
cpuctx = &per_cpu(perf_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx, NULL);
spin_lock(&perf_resource_lock);
cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
spin_unlock(&perf_resource_lock);
-
- hw_perf_event_setup(cpu);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -5443,20 +5429,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
perf_event_init_cpu(cpu);
break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- hw_perf_event_setup_online(cpu);
- break;
-
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
perf_event_exit_cpu(cpu);
break;
- case CPU_DEAD:
- hw_perf_event_setup_offline(cpu);
- break;
-
default:
break;
}
@@ -5474,6 +5451,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
void __init perf_event_init(void)
{
+ perf_event_init_all_cpus();
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
diff --git a/kernel/pid.c b/kernel/pid.c
index 86b296943e5f..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
- first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
+ first = rcu_dereference_check(pid->tasks[type].first,
+ rcu_read_lock_held() ||
+ lockdep_tasklist_lock_is_held());
if (first)
result = hlist_entry(first, struct task_struct, pids[(type)].node);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..79aac93acf99 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
rcu_read_lock();
/*
- * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
- * any nested-container's init processes don't ignore the
- * signal
+ * Any nested-container's init processes won't ignore the
+ * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
*/
task = pid_task(find_vpid(nr), PIDTYPE_PID);
if (task)
- force_sig(SIGKILL, task);
+ send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
rcu_read_unlock();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1439eb504c22..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -246,12 +246,21 @@ struct rcu_data {
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
-#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
- /* to take at least one */
- /* scheduling clock irq */
- /* before ratting on them. */
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
+ /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+ /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
+ /* to take at least one */
+ /* scheduling clock irq */
+ /* before ratting on them. */
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
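
For a sense of scale: assuming HZ=1000, the RCU_STALL_DELAY_DELTA slack added under CONFIG_PROVE_RCU moves the first stall check from 10 s to 15 s and the recheck from 30 s to 35 s, giving lockdep-instrumented kernels extra headroom. A tiny standalone check of that arithmetic:

/* Quick check of the stall-timeout arithmetic above, assuming HZ=1000 and
 * CONFIG_PROVE_RCU=y. */
#include <stdio.h>

#define HZ				1000
#define RCU_STALL_DELAY_DELTA		(5 * HZ)
#define RCU_SECONDS_TILL_STALL_CHECK	(10 * HZ + RCU_STALL_DELAY_DELTA)
#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ + RCU_STALL_DELAY_DELTA)

int main(void)
{
	printf("first check: %d jiffies (%d s)\n",
	       RCU_SECONDS_TILL_STALL_CHECK,
	       RCU_SECONDS_TILL_STALL_CHECK / HZ);
	printf("recheck:     %d jiffies (%d s)\n",
	       RCU_SECONDS_TILL_STALL_RECHECK,
	       RCU_SECONDS_TILL_STALL_RECHECK / HZ);
	return 0;
}
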
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 464ad2cdee00..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu)
int c = 0;
int thatcpu;
+ /* Check for being in the holdoff period. */
+ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
+ return rcu_needs_cpu_quick_check(cpu);
+
/* Don't bother unless we are the last non-dyntick-idle CPU. */
for_each_cpu_not(thatcpu, nohz_cpu_mask)
if (thatcpu != cpu) {
@@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu)
}
/* If RCU callbacks are still pending, RCU still needs this CPU. */
- if (c) {
+ if (c)
raise_softirq(RCU_SOFTIRQ);
- per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
- }
return c;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 150b6988de49..9ab3cd7858d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2359,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- struct rq *rq, *orig_rq;
+ struct rq *rq;
if (!sched_feat(SYNC_WAKEUPS))
wake_flags &= ~WF_SYNC;
@@ -2367,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
this_cpu = get_cpu();
smp_wmb();
- rq = orig_rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
if (!(p->state & state))
goto out;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 82095bf2099f..fccf9fbb0d7b 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -56,7 +56,7 @@ static int convert_prio(int prio)
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
- * current invokation. By the time the call returns, the CPUs may have in
+ * current invocation. By the time the call returns, the CPUs may have in
* fact changed priorities any number of times. While not ideal, it is not
* an issue of correctness since the normal rebalancer logic will correct
* any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e1fd96c6cf9..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h)
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference(cpu_rq(cpu)->sd);
+ return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}
/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5a6ed1f0990a..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1146,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
if (next && next->prio < idx)
continue;
list_for_each_entry(rt_se, array->queue + idx, run_list) {
- struct task_struct *p = rt_task_of(rt_se);
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
if (pick_rt_task(rq, p, cpu)) {
next = p;
break;
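
The sched_rt.c hunk guards rt_task_of() with rt_entity_is_task(): with group scheduling, an rt_se on the run list may represent a group rather than a task, and converting it back with container_of() would then point at garbage. A minimal userspace sketch of the guarded-downcast pattern, with an explicit tag field standing in for however the kernel distinguishes the two cases:

/* Illustrative model, not the kernel's layout: only task-backed entities
 * may be converted back to their containing task. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct sched_entity {
	int is_task;			/* the kernel infers this differently */
};

struct task {
	int pid;
	struct sched_entity se;
};

static int entity_is_task(const struct sched_entity *se)
{
	return se->is_task;
}

int main(void)
{
	struct task t = { .pid = 1234, .se = { .is_task = 1 } };
	struct sched_entity group_se = { .is_task = 0 };
	struct sched_entity *queue[] = { &group_se, &t.se };
	size_t i;

	for (i = 0; i < sizeof(queue) / sizeof(queue[0]); i++) {
		if (!entity_is_task(queue[i]))
			continue;	/* group entity: no task behind it */
		printf("task pid=%d\n",
		       container_of(queue[i], struct task, se)->pid);
	}
	return 0;
}
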
diff --git a/kernel/sys.c b/kernel/sys.c
index 9814e43fb23b..8298878f4f71 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/seccomp.h>
#include <linux/cpu.h>
+#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
@@ -1114,6 +1115,15 @@ out:
DECLARE_RWSEM(uts_sem);
+#ifdef COMPAT_UTS_MACHINE
+#define override_architecture(name) \
+ (current->personality == PER_LINUX32 && \
+ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
+ sizeof(COMPAT_UTS_MACHINE)))
+#else
+#define override_architecture(name) 0
+#endif
+
SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
{
int errno = 0;
@@ -1122,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
if (copy_to_user(name, utsname(), sizeof *name))
errno = -EFAULT;
up_read(&uts_sem);
+
+ if (!errno && override_architecture(name))
+ errno = -EFAULT;
return errno;
}
+#ifdef __ARCH_WANT_SYS_OLD_UNAME
+/*
+ * Old cruft
+ */
+SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
+{
+ int error = 0;
+
+ if (!name)
+ return -EFAULT;
+
+ down_read(&uts_sem);
+ if (copy_to_user(name, utsname(), sizeof(*name)))
+ error = -EFAULT;
+ up_read(&uts_sem);
+
+ if (!error && override_architecture(name))
+ error = -EFAULT;
+ return error;
+}
+
+SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
+{
+ int error;
+
+ if (!name)
+ return -EFAULT;
+ if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
+ return -EFAULT;
+
+ down_read(&uts_sem);
+ error = __copy_to_user(&name->sysname, &utsname()->sysname,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->release, &utsname()->release,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->release + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->version, &utsname()->version,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->version + __OLD_UTS_LEN);
+ error |= __copy_to_user(&name->machine, &utsname()->machine,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+ up_read(&uts_sem);
+
+ if (!error && override_architecture(name))
+ error = -EFAULT;
+ return error ? -EFAULT : 0;
+}
+#endif
+
SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
{
int errno;
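
The sys.c hunks make uname() and its older variants report the 32-bit machine string when the caller's personality is PER_LINUX32 (the override_architecture() macro), and add the legacy entry points behind __ARCH_WANT_SYS_OLD_UNAME. A small userspace check of the visible effect, assuming a 64-bit kernel that defines COMPAT_UTS_MACHINE (for example x86_64 reporting "i686"); personality(2) and uname(2) are the ordinary glibc wrappers:

/* Demonstration only: after switching to PER_LINUX32, the machine field
 * comes from COMPAT_UTS_MACHINE rather than the native architecture. */
#include <stdio.h>
#include <sys/personality.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname before, after;

	uname(&before);
	personality(PER_LINUX32);	/* same effect as running under setarch i686 */
	uname(&after);

	printf("native machine:      %s\n", before.machine);
	printf("PER_LINUX32 machine: %s\n", after.machine);
	return 0;
}
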
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
cond_syscall(sys_setuid16);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
+cond_syscall(sys_ipc);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ef19c614f6d..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
+#include <linux/signal.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
@@ -60,13 +61,23 @@
#include <asm/stacktrace.h>
#include <asm/io.h>
#endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/acct.h>
+#endif
+#ifdef CONFIG_RT_MUTEXES
+#include <linux/rtmutex.h>
+#endif
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
+#include <linux/lockdep.h>
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+#include <scsi/sg.h>
+#endif
#if defined(CONFIG_SYSCTL)
/* External variables not in a header file. */
-extern int C_A_D;
-extern int print_fatal_signals;
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int sysctl_panic_on_oom;
@@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
-#ifdef CONFIG_RCU_TORTURE_TEST
-extern int rcutorture_runnable;
-#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
#ifdef CONFIG_BLOCK
extern int blk_iopoll_enabled;
#endif
@@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
-#ifdef CONFIG_MODULES
-extern char modprobe_path[];
-extern int modules_disabled;
-#endif
-#ifdef CONFIG_CHR_DEV_SG
-extern int sg_big_buff;
-#endif
-
#ifdef CONFIG_SPARC
#include <asm/system.h>
#endif
@@ -149,10 +149,6 @@ extern int sysctl_userprocess_debug;
extern int spin_retry;
#endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-extern int acct_parm[];
-#endif
-
#ifdef CONFIG_IA64
extern int no_unaligned_warning;
extern int unaligned_dump_stack;
@@ -160,10 +156,6 @@ extern int unaligned_dump_stack;
extern struct ratelimit_state printk_ratelimit_state;
-#ifdef CONFIG_RT_MUTEXES
-extern int max_lock_depth;
-#endif
-
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -202,9 +194,6 @@ extern struct ctl_table epoll_table[];
int sysctl_legacy_va_layout;
#endif
-extern int prove_locking;
-extern int lock_stat;
-
/* The default sysctl tables: */
static struct ctl_table root_table[] = {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f663d23e85e..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -592,6 +592,10 @@ static inline void clocksource_select(void) { }
*/
static int __init clocksource_done_booting(void)
{
+ mutex_lock(&clocksource_mutex);
+ curr_clocksource = clocksource_default_clock();
+ mutex_unlock(&clocksource_mutex);
+
finished_booting = 1;
/*
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
ifeq ($(CONFIG_PERF_EVENTS),y)
-obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83783579378f..d9062f5cc0c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/hash.h>
+#include <linux/rcupdate.h>
#include <trace/events/sched.h>
@@ -84,22 +85,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
-#endif
-
+/*
+ * Traverse the ftrace_list, invoking all entries. The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
{
- struct ftrace_ops *op = ftrace_list;
-
- /* in case someone actually ports this to alpha! */
- read_barrier_depends();
+ struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
while (op != &ftrace_list_end) {
- /* silly alpha */
- read_barrier_depends();
op->func(ip, parent_ip);
- op = op->next;
+ op = rcu_dereference_raw(op->next); /*see above*/
};
}
@@ -154,8 +155,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
* the ops->next pointer is valid before another CPU sees
* the ops pointer included into the ftrace_list.
*/
- smp_wmb();
- ftrace_list = ops;
+ rcu_assign_pointer(ftrace_list, ops);
if (ftrace_enabled) {
ftrace_func_t func;
@@ -2276,6 +2276,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3351,6 +3353,7 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
+ t->curr_ret_stack = -1;
if (ftrace_graph_active) {
struct ftrace_ret_stack *ret_stack;
@@ -3360,7 +3363,6 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- t->curr_ret_stack = -1;
atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
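
The ftrace.c hunks replace the hand-rolled read_barrier_depends()/smp_wmb() pairs with rcu_assign_pointer() for publication and rcu_dereference_raw() for traversal; as the new comment explains, removed entries are simply leaked, so no grace-period machinery is needed. The following userspace model uses C11 atomics (a release store on publish, acquire loads on traversal) purely to illustrate that ordering; ftrace_op, register_op() and call_all() are illustrative names, not kernel APIs.

/* Userspace model of a publish-only callback list. */
#include <stdatomic.h>
#include <stdio.h>

struct ftrace_op {
	void (*func)(unsigned long ip);
	struct ftrace_op *_Atomic next;
};

static void hello(unsigned long ip) { printf("cb at %#lx\n", ip); }

static struct ftrace_op list_end = { .func = NULL };
static struct ftrace_op *_Atomic op_list = &list_end;

static void register_op(struct ftrace_op *op)
{
	/* fill in op->next first, then publish the new head with a
	 * release store (the rcu_assign_pointer() step) */
	atomic_store_explicit(&op->next,
			      atomic_load_explicit(&op_list,
						   memory_order_relaxed),
			      memory_order_relaxed);
	atomic_store_explicit(&op_list, op, memory_order_release);
}

static void call_all(unsigned long ip)
{
	/* readers walk the list with acquire loads, standing in for the
	 * dependency ordering rcu_dereference_raw() provides */
	struct ftrace_op *op =
		atomic_load_explicit(&op_list, memory_order_acquire);

	while (op != &list_end) {
		op->func(ip);
		op = atomic_load_explicit(&op->next, memory_order_acquire);
	}
}

int main(void)
{
	static struct ftrace_op op = { .func = hello };

	register_op(&op);
	call_all(0x1234);
	return 0;
}
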
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0287f9f52f5a..05a9f83b8819 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2233,12 +2233,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
if (ring_buffer_flags != RB_BUFFERS_ON)
return NULL;
- if (atomic_read(&buffer->record_disabled))
- return NULL;
-
/* If we are tracing schedule, we don't want to recurse */
resched = ftrace_preempt_disable();
+ if (atomic_read(&buffer->record_disabled))
+ goto out_nocheck;
+
if (trace_recursive_lock())
goto out_nocheck;
@@ -2470,11 +2470,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
if (ring_buffer_flags != RB_BUFFERS_ON)
return -EBUSY;
- if (atomic_read(&buffer->record_disabled))
- return -EBUSY;
-
resched = ftrace_preempt_disable();
+ if (atomic_read(&buffer->record_disabled))
+ goto out;
+
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2542,7 +2542,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
* @buffer: The ring buffer to enable writes
*
* Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
*/
void ring_buffer_record_enable(struct ring_buffer *buffer)
{
@@ -2578,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
* @cpu: The CPU to enable.
*
* Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
*/
void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
{
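
Both ring_buffer.c hunks move the record_disabled test inside the region where preemption is already disabled. The point is that a disabler who increments record_disabled and then waits for every CPU to leave its non-preemptible section can be certain no writer is still sitting between the test and the commit. A minimal single-threaded sketch of that ordering, with atomics and a drain loop standing in for preempt_disable()/synchronize_sched():

/* Illustrative model only; the kernel's writers and readers run on
 * different CPUs and the drain is done by synchronize_sched(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int record_disabled;
static atomic_int in_critical;		/* stands in for "preemption disabled" */

static bool writer_try_write(void)
{
	bool ok = false;

	atomic_fetch_add(&in_critical, 1);	/* ftrace_preempt_disable() */
	if (!atomic_load(&record_disabled)) {	/* checked only once inside */
		/* ... reserve and commit the event here ... */
		ok = true;
	}
	atomic_fetch_sub(&in_critical, 1);	/* ftrace_preempt_enable() */
	return ok;
}

static void disable_and_wait(void)
{
	atomic_fetch_add(&record_disabled, 1);
	while (atomic_load(&in_critical))	/* models synchronize_sched() */
		;				/* wait for in-flight writers */
}

int main(void)
{
	printf("write before disable: %d\n", writer_try_write());
	disable_and_wait();
	printf("write after disable:  %d\n", writer_try_write());
	return 0;
}
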
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed01fdba4a55..3ec2ee6f6560 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -374,6 +374,21 @@ static int __init set_buf_size(char *str)
}
__setup("trace_buf_size=", set_buf_size);
+static int __init set_tracing_thresh(char *str)
+{
+ unsigned long threshold;
+ int ret;
+
+ if (!str)
+ return 0;
+ ret = strict_strtoul(str, 0, &threshold);
+ if (ret < 0)
+ return 0;
+ tracing_thresh = threshold * 1000;
+ return 1;
+}
+__setup("tracing_thresh=", set_tracing_thresh);
+
unsigned long nsecs_to_usecs(unsigned long nsecs)
{
return nsecs / 1000;
@@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
static arch_spinlock_t ftrace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+unsigned long __read_mostly tracing_thresh;
+
#ifdef CONFIG_TRACER_MAX_TRACE
unsigned long __read_mostly tracing_max_latency;
-unsigned long __read_mostly tracing_thresh;
/*
* Copy the new maximum trace into the separate maximum-trace
@@ -592,7 +608,7 @@ static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
struct trace_array_cpu *data = tr->data[cpu];
- struct trace_array_cpu *max_data = tr->data[cpu];
+ struct trace_array_cpu *max_data;
max_tr.cpu = cpu;
max_tr.time_start = data->preempt_timestamp;
@@ -602,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+ memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
max_data->uid = task_uid(tsk);
max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -824,10 +840,10 @@ out:
mutex_unlock(&trace_types_lock);
}
-static void __tracing_reset(struct trace_array *tr, int cpu)
+static void __tracing_reset(struct ring_buffer *buffer, int cpu)
{
ftrace_disable_cpu();
- ring_buffer_reset_cpu(tr->buffer, cpu);
+ ring_buffer_reset_cpu(buffer, cpu);
ftrace_enable_cpu();
}
@@ -839,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
/* Make sure all commits have finished */
synchronize_sched();
- __tracing_reset(tr, cpu);
+ __tracing_reset(buffer, cpu);
ring_buffer_record_enable(buffer);
}
@@ -857,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
- __tracing_reset(tr, cpu);
+ __tracing_reset(buffer, cpu);
ring_buffer_record_enable(buffer);
}
@@ -934,6 +950,8 @@ void tracing_start(void)
goto out;
}
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&ftrace_max_lock);
buffer = global_trace.buffer;
if (buffer)
@@ -943,6 +961,8 @@ void tracing_start(void)
if (buffer)
ring_buffer_record_enable(buffer);
+ arch_spin_unlock(&ftrace_max_lock);
+
ftrace_start();
out:
spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -964,6 +984,9 @@ void tracing_stop(void)
if (trace_stop_count++)
goto out;
+ /* Prevent the buffers from switching */
+ arch_spin_lock(&ftrace_max_lock);
+
buffer = global_trace.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
@@ -972,6 +995,8 @@ void tracing_stop(void)
if (buffer)
ring_buffer_record_disable(buffer);
+ arch_spin_unlock(&ftrace_max_lock);
+
out:
spin_unlock_irqrestore(&tracing_start_lock, flags);
}
@@ -1259,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
+ /*
+ * NMIs can not handle page faults, even with fix ups.
+ * The save user stack can (and often does) fault.
+ */
+ if (unlikely(in_nmi()))
+ return;
+
event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
sizeof(*entry), flags, pc);
if (!event)
@@ -1703,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
ftrace_enable_cpu();
+ iter->leftover = 0;
for (p = iter; p && l < *pos; p = s_next(m, p, &l))
;
@@ -4248,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tracing_max_latency, &tracing_max_lat_fops);
+#endif
trace_create_file("tracing_thresh", 0644, d_tracer,
&tracing_thresh, &tracing_max_lat_fops);
-#endif
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
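
Among the trace.c changes, tracing_thresh becomes available independently of CONFIG_TRACER_MAX_TRACE and gains a tracing_thresh= boot parameter that takes microseconds and stores nanoseconds internally. A userspace sketch of just that parsing step, with strtoul() standing in for strict_strtoul():

/* Illustrative parser for a "tracing_thresh=<usecs>" style parameter. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned long tracing_thresh;	/* nanoseconds */

static int set_tracing_thresh(const char *str)
{
	char *end;
	unsigned long usecs;

	if (!str)
		return 0;
	errno = 0;
	usecs = strtoul(str, &end, 0);
	if (errno || end == str)
		return 0;
	tracing_thresh = usecs * 1000;	/* usecs on the command line -> nsecs */
	return 1;
}

int main(void)
{
	set_tracing_thresh("200");	/* e.g. tracing_thresh=200 */
	printf("tracing_thresh = %lu ns\n", tracing_thresh);
	return 0;
}
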
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd05bcaf91b0..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+extern unsigned long tracing_thresh;
+
#ifdef CONFIG_TRACER_MAX_TRACE
extern unsigned long tracing_max_latency;
-extern unsigned long tracing_thresh;
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
@@ -550,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
* struct trace_parser - servers for reading the user input separated by spaces
* @cont: set if the input is not complete - no final space char was found
* @buffer: holds the parsed user input
- * @idx: user input lenght
+ * @idx: user input length
* @size: buffer size
*/
struct trace_parser {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..6fbfb8f417b9 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
* Tracer plugins will chose a default from these clocks.
*/
#include <linux/spinlock.h>
+#include <linux/irqflags.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..81f691eb3a30 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,36 @@
/*
- * trace event based perf counter profiling
+ * trace event based perf event profiling/tracing
*
* Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- *
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
*/
#include <linux/module.h>
#include <linux/kprobes.h>
#include "trace.h"
+DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
+EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
+
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
static char *perf_trace_buf;
static char *perf_trace_buf_nmi;
-typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
+typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ;
/* Count the events in use (per event id, not per instance) */
-static int total_profile_count;
+static int total_ref_count;
-static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+static int perf_trace_event_enable(struct ftrace_event_call *event)
{
char *buf;
int ret = -ENOMEM;
- if (event->profile_count++ > 0)
+ if (event->perf_refcount++ > 0)
return 0;
- if (!total_profile_count) {
+ if (!total_ref_count) {
buf = (char *)alloc_percpu(perf_trace_t);
if (!buf)
goto fail_buf;
@@ -40,35 +44,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
rcu_assign_pointer(perf_trace_buf_nmi, buf);
}
- ret = event->profile_enable(event);
+ ret = event->perf_event_enable(event);
if (!ret) {
- total_profile_count++;
+ total_ref_count++;
return 0;
}
fail_buf_nmi:
- if (!total_profile_count) {
+ if (!total_ref_count) {
free_percpu(perf_trace_buf_nmi);
free_percpu(perf_trace_buf);
perf_trace_buf_nmi = NULL;
perf_trace_buf = NULL;
}
fail_buf:
- event->profile_count--;
+ event->perf_refcount--;
return ret;
}
-int ftrace_profile_enable(int event_id)
+int perf_trace_enable(int event_id)
{
struct ftrace_event_call *event;
int ret = -EINVAL;
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id && event->profile_enable &&
+ if (event->id == event_id && event->perf_event_enable &&
try_module_get(event->mod)) {
- ret = ftrace_profile_enable_event(event);
+ ret = perf_trace_event_enable(event);
break;
}
}
@@ -77,16 +81,16 @@ int ftrace_profile_enable(int event_id)
return ret;
}
-static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+static void perf_trace_event_disable(struct ftrace_event_call *event)
{
char *buf, *nmi_buf;
- if (--event->profile_count > 0)
+ if (--event->perf_refcount > 0)
return;
- event->profile_disable(event);
+ event->perf_event_disable(event);
- if (!--total_profile_count) {
+ if (!--total_ref_count) {
buf = perf_trace_buf;
rcu_assign_pointer(perf_trace_buf, NULL);
@@ -104,14 +108,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
}
}
-void ftrace_profile_disable(int event_id)
+void perf_trace_disable(int event_id)
{
struct ftrace_event_call *event;
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
if (event->id == event_id) {
- ftrace_profile_disable_event(event);
+ perf_trace_event_disable(event);
module_put(event->mod);
break;
}
@@ -119,8 +123,8 @@ void ftrace_profile_disable(int event_id)
mutex_unlock(&event_mutex);
}
-__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
- int *rctxp, unsigned long *irq_flags)
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
+ int *rctxp, unsigned long *irq_flags)
{
struct trace_entry *entry;
char *trace_buf, *raw_data;
@@ -138,9 +142,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
cpu = smp_processor_id();
if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
else
- trace_buf = rcu_dereference(perf_trace_buf);
+ trace_buf = rcu_dereference_sched(perf_trace_buf);
if (!trace_buf)
goto err;
@@ -161,4 +165,4 @@ err_recursion:
local_irq_restore(*irq_flags);
return NULL;
}
-EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
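
Beyond the profile-to-perf renaming, the file above keeps a single shared pair of trace buffers alive with a per-event refcount plus a global total_ref_count: the buffers are allocated when the global count goes from 0 to 1 and freed when it drops back to 0. A simplified userspace sketch of that lifetime rule (malloc() in place of alloc_percpu(), no RCU publication, illustrative names):

/* Shared-buffer lifetime driven by per-event and global refcounts. */
#include <stdio.h>
#include <stdlib.h>

static char *perf_trace_buf;
static int total_ref_count;

struct event {
	int refcount;
};

static int event_enable(struct event *ev)
{
	if (ev->refcount++ > 0)
		return 0;			/* this event is already on */
	if (!total_ref_count) {
		perf_trace_buf = malloc(4096);	/* alloc_percpu() in the kernel */
		if (!perf_trace_buf) {
			ev->refcount--;
			return -1;
		}
	}
	total_ref_count++;
	return 0;
}

static void event_disable(struct event *ev)
{
	if (--ev->refcount > 0)
		return;				/* other users of this event remain */
	if (!--total_ref_count) {
		free(perf_trace_buf);		/* last event gone: drop the buffer */
		perf_trace_buf = NULL;
	}
}

int main(void)
{
	struct event a = { 0 }, b = { 0 };

	event_enable(&a);
	event_enable(&b);
	event_disable(&a);
	printf("buffer allocated: %s\n", perf_trace_buf ? "yes" : "no");
	event_disable(&b);
	printf("buffer allocated: %s\n", perf_trace_buf ? "yes" : "no");
	return 0;
}
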
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3f972ad98d04..beab8bf2f310 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -938,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id && call->profile_enable)
+ if (call->id && call->perf_event_enable)
trace_create_file("id", 0444, call->dir, call,
id);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3fc2a575664f..e6989d9b44da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
+int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+{
+ if (tracing_thresh)
+ return 1;
+ else
+ return trace_graph_entry(trace);
+}
+
static void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags,
@@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr)
smp_mb();
}
+void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+{
+ if (tracing_thresh &&
+ (trace->rettime - trace->calltime < tracing_thresh))
+ return;
+ else
+ trace_graph_return(trace);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
set_graph_array(tr);
- ret = register_ftrace_graph(&trace_graph_return,
- &trace_graph_entry);
+ if (tracing_thresh)
+ ret = register_ftrace_graph(&trace_graph_thresh_return,
+ &trace_graph_thresh_entry);
+ else
+ ret = register_ftrace_graph(&trace_graph_return,
+ &trace_graph_entry);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -920,7 +941,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
} else {
- ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
+ ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
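
The graph-tracer hunks add a threshold mode: when tracing_thresh is set, function entries are not recorded at all and a return event is kept only if the function ran for at least tracing_thresh nanoseconds. A standalone sketch of that filter, using an illustrative 100 us threshold and a simplified record structure:

/* Threshold filtering on function duration, modelled in userspace. */
#include <stdio.h>

static unsigned long tracing_thresh = 100000;	/* 100 us, illustrative */

struct graph_ret {
	unsigned long long calltime;
	unsigned long long rettime;
};

static void record_return(const struct graph_ret *trace)
{
	printf("slow function: %llu ns\n", trace->rettime - trace->calltime);
}

static void graph_thresh_return(const struct graph_ret *trace)
{
	if (tracing_thresh &&
	    trace->rettime - trace->calltime < tracing_thresh)
		return;				/* fast function: drop it */
	record_return(trace);
}

int main(void)
{
	struct graph_ret fast = { .calltime = 0, .rettime = 50000 };
	struct graph_ret slow = { .calltime = 0, .rettime = 250000 };

	graph_thresh_return(&fast);		/* filtered out */
	graph_thresh_return(&slow);		/* printed */
	return 0;
}
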
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 505c92273b1a..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes void kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"profile buffer not large enough"))
return;
- entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
return;
@@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
}
/* Kretprobe profile handler */
-static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"profile buffer not large enough"))
return;
- entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
return;
@@ -1271,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
+ perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+ irq_flags, regs);
}
-static int probe_profile_enable(struct ftrace_event_call *call)
+static int probe_perf_enable(struct ftrace_event_call *call)
{
struct trace_probe *tp = (struct trace_probe *)call->data;
@@ -1286,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
return enable_kprobe(&tp->rp.kp);
}
-static void probe_profile_disable(struct ftrace_event_call *call)
+static void probe_perf_disable(struct ftrace_event_call *call)
{
struct trace_probe *tp = (struct trace_probe *)call->data;
@@ -1311,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
kprobe_trace_func(kp, regs);
#ifdef CONFIG_PERF_EVENTS
if (tp->flags & TP_FLAG_PROFILE)
- kprobe_profile_func(kp, regs);
+ kprobe_perf_func(kp, regs);
#endif
return 0; /* We don't tweak kernel, so just return 0 */
}
@@ -1325,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
kretprobe_trace_func(ri, regs);
#ifdef CONFIG_PERF_EVENTS
if (tp->flags & TP_FLAG_PROFILE)
- kretprobe_profile_func(ri, regs);
+ kretprobe_perf_func(ri, regs);
#endif
return 0; /* We don't tweak kernel, so just return 0 */
}
@@ -1358,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp)
call->unregfunc = probe_event_disable;
#ifdef CONFIG_PERF_EVENTS
- call->profile_enable = probe_profile_enable;
- call->profile_disable = probe_profile_disable;
+ call->perf_event_enable = probe_perf_enable;
+ call->perf_event_disable = probe_perf_disable;
#endif
call->data = tp;
ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cba47d7935cc..33c2a5b769dc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -428,12 +428,12 @@ core_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
-static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
-static int sys_prof_refcount_enter;
-static int sys_prof_refcount_exit;
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
-static void prof_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
@@ -443,7 +443,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
int size;
syscall_nr = syscall_get_nr(current, regs);
- if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+ if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -455,11 +455,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
- "profile buffer not large enough"))
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "perf buffer not large enough"))
return;
- rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+ rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
sys_data->enter_event->id, &rctx, &flags);
if (!rec)
return;
@@ -467,10 +467,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
}
-int prof_sysenter_enable(struct ftrace_event_call *call)
+int perf_sysenter_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -478,34 +478,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
- if (!sys_prof_refcount_enter)
- ret = register_trace_sys_enter(prof_syscall_enter);
+ if (!sys_perf_refcount_enter)
+ ret = register_trace_sys_enter(perf_syscall_enter);
if (ret) {
pr_info("event trace: Could not activate"
"syscall entry trace point");
} else {
- set_bit(num, enabled_prof_enter_syscalls);
- sys_prof_refcount_enter++;
+ set_bit(num, enabled_perf_enter_syscalls);
+ sys_perf_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-void prof_sysenter_disable(struct ftrace_event_call *call)
+void perf_sysenter_disable(struct ftrace_event_call *call)
{
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
- sys_prof_refcount_enter--;
- clear_bit(num, enabled_prof_enter_syscalls);
- if (!sys_prof_refcount_enter)
- unregister_trace_sys_enter(prof_syscall_enter);
+ sys_perf_refcount_enter--;
+ clear_bit(num, enabled_perf_enter_syscalls);
+ if (!sys_perf_refcount_enter)
+ unregister_trace_sys_enter(perf_syscall_enter);
mutex_unlock(&syscall_trace_lock);
}
-static void prof_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
@@ -515,7 +515,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
int size;
syscall_nr = syscall_get_nr(current, regs);
- if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+ if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -530,11 +530,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
* Impossible, but be paranoid with the future
* How to put this check outside runtime?
*/
- if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
- "exit event has grown above profile buffer size"))
+ if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+ "exit event has grown above perf buffer size"))
return;
- rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+ rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
sys_data->exit_event->id, &rctx, &flags);
if (!rec)
return;
@@ -542,10 +542,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
}
-int prof_sysexit_enable(struct ftrace_event_call *call)
+int perf_sysexit_enable(struct ftrace_event_call *call)
{
int ret = 0;
int num;
@@ -553,30 +553,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
- if (!sys_prof_refcount_exit)
- ret = register_trace_sys_exit(prof_syscall_exit);
+ if (!sys_perf_refcount_exit)
+ ret = register_trace_sys_exit(perf_syscall_exit);
if (ret) {
pr_info("event trace: Could not activate"
"syscall exit trace point");
} else {
- set_bit(num, enabled_prof_exit_syscalls);
- sys_prof_refcount_exit++;
+ set_bit(num, enabled_perf_exit_syscalls);
+ sys_perf_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-void prof_sysexit_disable(struct ftrace_event_call *call)
+void perf_sysexit_disable(struct ftrace_event_call *call)
{
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
mutex_lock(&syscall_trace_lock);
- sys_prof_refcount_exit--;
- clear_bit(num, enabled_prof_exit_syscalls);
- if (!sys_prof_refcount_exit)
- unregister_trace_sys_exit(prof_syscall_exit);
+ sys_perf_refcount_exit--;
+ clear_bit(num, enabled_perf_exit_syscalls);
+ if (!sys_perf_refcount_exit)
+ unregister_trace_sys_exit(perf_syscall_exit);
mutex_unlock(&syscall_trace_lock);
}
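
The renamed syscall perf hooks above keep the same bookkeeping as before: a single tracepoint probe is registered for the first enabled syscall and unregistered after the last one goes away, while a bitmap records which individual syscalls should actually emit events. A userspace sketch of that enable/disable logic; NR_SYSCALLS and the bit helpers are illustrative, not the kernel's.

/* Probe refcount plus per-syscall bitmap, modelled in userspace. */
#include <stdio.h>

#define NR_SYSCALLS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long enabled_enter[NR_SYSCALLS / BITS_PER_LONG + 1];
static int refcount_enter;
static int probe_registered;

static void set_bit(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void clear_bit(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static int test_bit(int nr, const unsigned long *map)
{
	return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static void sysenter_enable(int nr)
{
	if (!refcount_enter)
		probe_registered = 1;	/* register_trace_sys_enter() */
	set_bit(nr, enabled_enter);
	refcount_enter++;
}

static void sysenter_disable(int nr)
{
	refcount_enter--;
	clear_bit(nr, enabled_enter);
	if (!refcount_enter)
		probe_registered = 0;	/* unregister_trace_sys_enter() */
}

int main(void)
{
	sysenter_enable(3);
	printf("probe=%d syscall3=%d\n", probe_registered, test_bit(3, enabled_enter));
	sysenter_disable(3);
	printf("probe=%d syscall3=%d\n", probe_registered, test_bit(3, enabled_enter));
	return 0;
}
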