summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2012-02-03 23:12:42 +0100
committerJiri Kosina <jkosina@suse.cz>2012-02-03 23:13:05 +0100
commit972c5ae961d6e5103e2b33d935cfa4145fd47140 (patch)
tree350b2a76b979ba8766c09838617df67ff330eca0 /kernel
parent5196d20305d5e30d871111d3a876cf067dd94255 (diff)
parent7c7ed8ec337bf5f62cc5287a6eb6b2f1b7504c2f (diff)
downloadlinux-stable-972c5ae961d6e5103e2b33d935cfa4145fd47140.tar.gz
linux-stable-972c5ae961d6e5103e2b33d935cfa4145fd47140.tar.bz2
linux-stable-972c5ae961d6e5103e2b33d935cfa4145fd47140.zip
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply patch to a newer code (namely drivers/gpu/drm/gma500/psb_intel_lvds.c)
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/async.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c738
-rw-r--r--kernel/capability.c80
-rw-r--r--kernel/cgroup.c401
-rw-r--r--kernel/cgroup_freezer.c16
-rw-r--r--kernel/cpuset.c105
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c119
-rw-r--r--kernel/exit.c28
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/kexec.c25
-rw-r--r--kernel/kprobes.c4
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/panic.c26
-rw-r--r--kernel/params.c38
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/power/process.c19
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/swap.c13
-rw-r--r--kernel/power/user.c9
-rw-r--r--kernel/printk.c10
-rw-r--r--kernel/ptrace.c14
-rw-r--r--kernel/rcutorture.c8
-rw-r--r--kernel/res_counter.c28
-rw-r--r--kernel/sched/core.c59
-rw-r--r--kernel/sched/cpupri.c3
-rw-r--r--kernel/sched/fair.c44
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c74
-rw-r--r--kernel/sys.c121
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/trace/ftrace.c715
-rw-r--r--kernel/trace/trace_events_filter.c283
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c32
47 files changed, 2234 insertions, 1178 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f70396e5a24b..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_irq_work.o = -pg
endif
obj-y += sched/
+obj-y += power/
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
-obj-$(CONFIG_PM) += power/
-obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-extern int initcall_debug;
-
/*
* MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..bb0eb5bb9a0a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+ if (!capable(CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+ if (!capable(CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+ audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
pid, uid, auid, ses);
if (sid) {
rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
- goto out;
+ goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
- va_end(args2);
if (len > 0)
skb_put(skb, len);
+out_va_end:
+ va_end(args2);
out:
return;
}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
char *p, *pathname;
if (prefix)
- audit_log_format(ab, " %s", prefix);
+ audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
- * but don't necessarily fill it in at
- * syscall entry time (i.e., filter
- * instead). */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
- * and always fill it in at syscall
+ * and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
switch(listnr) {
default:
goto exit_err;
- case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
+ if (rule->action == AUDIT_ALWAYS)
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
goto exit_free;
break;
case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
+ if (f->val & ~S_IFMT)
goto exit_free;
break;
case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
+ case AUDIT_OBJ_UID:
+ case AUDIT_OBJ_GID:
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILTERKEY:
- err = -EINVAL;
if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
+ if (f->val & ~S_IFMT)
+ goto exit_free;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
goto exit_free;
break;
default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e7fe2b0d29b3..af1de0f34eae 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
#include "audit.h"
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES 20
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
*
* Further, in fs/namei.c:path_lookup() we store the inode and device. */
struct audit_names {
+ struct list_head list; /* audit_context->names_list */
const char *name;
- int name_len; /* number of name's characters to log */
- unsigned name_put; /* call __putname() for this name */
unsigned long ino;
dev_t dev;
umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
u32 osid;
struct audit_cap_data fcap;
unsigned int fcap_ver;
+ int name_len; /* number of name's characters to log */
+ bool name_put; /* call __putname() for this name */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit
+ */
+ bool should_free;
};
struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
- int name_count;
- struct audit_names names[AUDIT_NAMES];
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* anchor for struct audit_names->list */
char * filterkey; /* key for rule that triggered record */
struct path pwd;
struct audit_context *previous; /* For nested syscalls */
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
}
}
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
{
- unsigned index = which & ~S_IFMT;
- umode_t mode = which & S_IFMT;
+ struct audit_names *n;
+ umode_t mode = (umode_t)val;
if (unlikely(!ctx))
return 0;
- if (index >= ctx->name_count)
- return 0;
- if (ctx->names[index].ino == -1)
- return 0;
- if ((ctx->names[index].mode ^ mode) & S_IFMT)
- return 0;
- return 1;
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if ((n->ino != -1) &&
+ ((n->mode & S_IFMT) == mode))
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
+static int audit_compare_id(uid_t uid1,
+ struct audit_names *name,
+ unsigned long name_offset,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ unsigned long addr;
+ uid_t uid2;
+ int rc;
+
+ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+
+ if (name) {
+ addr = (unsigned long)name;
+ addr += name_offset;
+
+ uid2 = *(uid_t *)addr;
+ rc = audit_comparator(uid1, f->op, uid2);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ addr = (unsigned long)n;
+ addr += name_offset;
+
+ uid2 = *(uid_t *)addr;
+
+ rc = audit_comparator(uid1, f->op, uid2);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+ const struct cred *cred,
+ struct audit_field *f,
+ struct audit_context *ctx,
+ struct audit_names *name)
+{
+ switch (f->val) {
+ /* process to file object comparisons */
+ case AUDIT_COMPARE_UID_TO_OBJ_UID:
+ return audit_compare_id(cred->uid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_GID_TO_OBJ_GID:
+ return audit_compare_id(cred->gid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+ return audit_compare_id(cred->euid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+ return audit_compare_id(cred->egid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+ return audit_compare_id(tsk->loginuid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+ return audit_compare_id(cred->suid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+ return audit_compare_id(cred->sgid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+ return audit_compare_id(cred->fsuid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+ return audit_compare_id(cred->fsgid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ /* uid comparisons */
+ case AUDIT_COMPARE_UID_TO_AUID:
+ return audit_comparator(cred->uid, f->op, tsk->loginuid);
+ case AUDIT_COMPARE_UID_TO_EUID:
+ return audit_comparator(cred->uid, f->op, cred->euid);
+ case AUDIT_COMPARE_UID_TO_SUID:
+ return audit_comparator(cred->uid, f->op, cred->suid);
+ case AUDIT_COMPARE_UID_TO_FSUID:
+ return audit_comparator(cred->uid, f->op, cred->fsuid);
+ /* auid comparisons */
+ case AUDIT_COMPARE_AUID_TO_EUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->euid);
+ case AUDIT_COMPARE_AUID_TO_SUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->suid);
+ case AUDIT_COMPARE_AUID_TO_FSUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+ /* euid comparisons */
+ case AUDIT_COMPARE_EUID_TO_SUID:
+ return audit_comparator(cred->euid, f->op, cred->suid);
+ case AUDIT_COMPARE_EUID_TO_FSUID:
+ return audit_comparator(cred->euid, f->op, cred->fsuid);
+ /* suid comparisons */
+ case AUDIT_COMPARE_SUID_TO_FSUID:
+ return audit_comparator(cred->suid, f->op, cred->fsuid);
+ /* gid comparisons */
+ case AUDIT_COMPARE_GID_TO_EGID:
+ return audit_comparator(cred->gid, f->op, cred->egid);
+ case AUDIT_COMPARE_GID_TO_SGID:
+ return audit_comparator(cred->gid, f->op, cred->sgid);
+ case AUDIT_COMPARE_GID_TO_FSGID:
+ return audit_comparator(cred->gid, f->op, cred->fsgid);
+ /* egid comparisons */
+ case AUDIT_COMPARE_EGID_TO_SGID:
+ return audit_comparator(cred->egid, f->op, cred->sgid);
+ case AUDIT_COMPARE_EGID_TO_FSGID:
+ return audit_comparator(cred->egid, f->op, cred->fsgid);
+ /* sgid comparison */
+ case AUDIT_COMPARE_SGID_TO_FSGID:
+ return audit_comparator(cred->sgid, f->op, cred->fsgid);
+ default:
+ WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
+ return 0;
+ }
+ return 0;
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
bool task_creation)
{
const struct cred *cred;
- int i, j, need_sid = 1;
+ int i, need_sid = 1;
u32 sid;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ struct audit_names *n;
int result = 0;
switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMAJOR:
- if (name)
- result = audit_comparator(MAJOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMINOR:
- if (name)
- result = audit_comparator(MINOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+ audit_comparator(MINOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+ audit_comparator(MINOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
if (name)
result = (name->ino == f->val);
else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_UID:
+ if (name) {
+ result = audit_comparator(name->uid, f->op, f->val);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->uid, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_GID:
+ if (name) {
+ result = audit_comparator(name->gid, f->op, f->val);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->gid, f->op, f->val)) {
++result;
break;
}
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
name->osid, f->type, f->op,
f->lsm_rule, ctx);
} else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (security_audit_rule_match(
- ctx->names[j].osid,
- f->type, f->op,
- f->lsm_rule, ctx)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (security_audit_rule_match(n->osid, f->type,
+ f->op, f->lsm_rule,
+ ctx)) {
++result;
break;
}
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
break;
+ case AUDIT_FIELD_COMPARE:
+ result = audit_field_compare(tsk, cred, f, ctx, name);
+ break;
}
-
if (!result)
return 0;
}
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_BUILD_CONTEXT;
}
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+ struct audit_names *n,
+ struct audit_context *ctx) {
+ int word, bit;
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+ struct audit_entry *e;
+ enum audit_state state;
+
+ word = AUDIT_WORD(ctx->major);
+ bit = AUDIT_BIT(ctx->major);
+
+ if (list_empty(list))
+ return 0;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
* collected during syscall processing. We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
* Regarding audit_state, same rules apply as for audit_filter_syscall().
*/
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
- int i;
- struct audit_entry *e;
- enum audit_state state;
+ struct audit_names *n;
if (audit_pid && tsk->tgid == audit_pid)
return;
rcu_read_lock();
- for (i = 0; i < ctx->name_count; i++) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
- struct audit_names *n = &ctx->names[i];
- int h = audit_hash_ino((u32)n->ino);
- struct list_head *list = &audit_inode_hash[h];
-
- if (list_empty(list))
- continue;
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_filter_inode_name(tsk, n, ctx))
+ break;
}
rcu_read_unlock();
}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
{
struct audit_context *context = tsk->audit_context;
- if (likely(!context))
+ if (!context)
return NULL;
context->return_valid = return_valid;
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
static inline void audit_free_names(struct audit_context *context)
{
- int i;
+ struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
- for (i = 0; i < context->name_count; i++) {
+ list_for_each_entry(n, &context->names_list, list) {
printk(KERN_ERR "names[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
+ n->name, n->name ?: "(null)");
}
dump_stack();
return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count = 0;
#endif
- for (i = 0; i < context->name_count; i++) {
- if (context->names[i].name && context->names[i].name_put)
- __putname(context->names[i].name);
+ list_for_each_entry_safe(n, next, &context->names_list, list) {
+ list_del(&n->list);
+ if (n->name && n->name_put)
+ __putname(n->name);
+ if (n->should_free)
+ kfree(n);
}
context->name_count = 0;
path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
return NULL;
audit_zero_context(context, state);
INIT_LIST_HEAD(&context->killed_trees);
+ INIT_LIST_HEAD(&context->names_list);
return context;
}
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return 0;
if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
while (vma) {
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
- audit_log_d_path(ab, "exe=",
+ audit_log_d_path(ab, " exe=",
&vma->vm_file->f_path);
break;
}
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
struct audit_buffer **ab,
struct audit_aux_data_execve *axi)
{
- int i;
- size_t len, len_sent = 0;
+ int i, len;
+ size_t len_sent = 0;
const char __user *p;
char *buf;
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
}
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return; /* audit_panic has been called */
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ n->uid,
+ n->gid,
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ audit_log_fcaps(ab, n);
+
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
struct audit_buffer *ab;
struct audit_aux_data *aux;
const char *tty;
+ struct audit_names *n;
/* tsk == current */
context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", &context->pwd);
+ audit_log_d_path(ab, " cwd=", &context->pwd);
audit_log_end(ab);
}
}
- for (i = 0; i < context->name_count; i++) {
- struct audit_names *n = &context->names[i];
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- continue; /* audit_panic has been called */
-
- audit_log_format(ab, "item=%d", i);
-
- if (n->name) {
- switch(n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, "name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- audit_log_fcaps(ab, n);
-
- audit_log_end(ab);
- }
+ i = 0;
+ list_for_each_entry(n, &context->names_list, list)
+ audit_log_name(context, n, i++, &call_panic);
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*
* Called from copy_process and do_exit
*/
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
context = audit_get_context(tsk, 0, 0);
- if (likely(!context))
+ if (!context)
return;
/* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
* will only be written if another part of the kernel requests that it
* be written).
*/
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
struct audit_context *context = tsk->audit_context;
enum audit_state state;
- if (unlikely(!context))
+ if (!context)
return;
/*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
context->prio = 0;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
}
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return;
context->serial = 0;
@@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major,
context->ppid = 0;
}
-void audit_finish_fork(struct task_struct *child)
-{
- struct audit_context *ctx = current->audit_context;
- struct audit_context *p = child->audit_context;
- if (!p || !ctx)
- return;
- if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
- return;
- p->arch = ctx->arch;
- p->major = ctx->major;
- memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
- p->ctime = ctx->ctime;
- p->dummy = ctx->dummy;
- p->in_syscall = ctx->in_syscall;
- p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
- p->ppid = current->pid;
- p->prio = ctx->prio;
- p->current_state = ctx->current_state;
-}
-
/**
* audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @success: success value of the syscall
+ * @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_RECORD_CONTEXT state from
- * filtering, or because some other part of the kernel write an audit
+ * filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
{
struct task_struct *tsk = current;
struct audit_context *context;
- context = audit_get_context(tsk, valid, return_code);
+ if (success)
+ success = AUDITSC_SUCCESS;
+ else
+ success = AUDITSC_FAILURE;
- if (likely(!context))
+ context = audit_get_context(tsk, success, return_code);
+ if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2008,30 @@ retry:
#endif
}
+static struct audit_names *audit_alloc_name(struct audit_context *context)
+{
+ struct audit_names *aname;
+
+ if (context->name_count < AUDIT_NAMES) {
+ aname = &context->preallocated_names[context->name_count];
+ memset(aname, 0, sizeof(*aname));
+ } else {
+ aname = kzalloc(sizeof(*aname), GFP_NOFS);
+ if (!aname)
+ return NULL;
+ aname->should_free = true;
+ }
+
+ aname->ino = (unsigned long)-1;
+ list_add_tail(&aname->list, &context->names_list);
+
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ return aname;
+}
+
/**
* audit_getname - add a name to the list
* @name: name to add
@@ -1831,9 +2042,7 @@ retry:
void __audit_getname(const char *name)
{
struct audit_context *context = current->audit_context;
-
- if (IS_ERR(name) || !name)
- return;
+ struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
@@ -1843,13 +2052,15 @@ void __audit_getname(const char *name)
#endif
return;
}
- BUG_ON(context->name_count >= AUDIT_NAMES);
- context->names[context->name_count].name = name;
- context->names[context->name_count].name_len = AUDIT_NAME_FULL;
- context->names[context->name_count].name_put = 1;
- context->names[context->name_count].ino = (unsigned long)-1;
- context->names[context->name_count].osid = 0;
- ++context->name_count;
+
+ n = audit_alloc_name(context);
+ if (!n)
+ return;
+
+ n->name = name;
+ n->name_len = AUDIT_NAME_FULL;
+ n->name_put = true;
+
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
@@ -1871,12 +2082,13 @@ void audit_putname(const char *name)
printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
+ struct audit_names *n;
int i;
- for (i = 0; i < context->name_count; i++)
+
+ list_for_each_entry(n, &context->names_list, list)
printk(KERN_ERR "name[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
- }
+ n->name, n->name ?: "(null)");
+ }
#endif
__putname(name);
}
@@ -1897,39 +2109,11 @@ void audit_putname(const char *name)
#endif
}
-static int audit_inc_name_count(struct audit_context *context,
- const struct inode *inode)
-{
- if (context->name_count >= AUDIT_NAMES) {
- if (inode)
- printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
- "dev=%02x:%02x, inode=%lu\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev),
- inode->i_ino);
-
- else
- printk(KERN_DEBUG "name_count maxed, losing inode data\n");
- return 1;
- }
- context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
- return 0;
-}
-
-
static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
{
struct cpu_vfs_cap_data caps;
int rc;
- memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
- memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
- name->fcap.fE = 0;
- name->fcap_ver = 0;
-
if (!dentry)
return 0;
@@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
*/
void __audit_inode(const char *name, const struct dentry *dentry)
{
- int idx;
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
+ struct audit_names *n;
if (!context->in_syscall)
return;
- if (context->name_count
- && context->names[context->name_count-1].name
- && context->names[context->name_count-1].name == name)
- idx = context->name_count - 1;
- else if (context->name_count > 1
- && context->names[context->name_count-2].name
- && context->names[context->name_count-2].name == name)
- idx = context->name_count - 2;
- else {
- /* FIXME: how much do we care about inodes that have no
- * associated name? */
- if (audit_inc_name_count(context, inode))
- return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
+
+ list_for_each_entry_reverse(n, &context->names_list, list) {
+ if (n->name && (n->name == name))
+ goto out;
}
+
+ /* unable to find the name from a previous getname() */
+ n = audit_alloc_name(context);
+ if (!n)
+ return;
+out:
handle_path(dentry);
- audit_copy_inode(&context->names[idx], dentry, inode);
+ audit_copy_inode(n, dentry, inode);
}
/**
@@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
void __audit_inode_child(const struct dentry *dentry,
const struct inode *parent)
{
- int idx;
struct audit_context *context = current->audit_context;
const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
+ struct audit_names *n;
int dirlen = 0;
if (!context->in_syscall)
@@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry,
handle_one(inode);
/* parent is more likely, look for it first */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
-
+ list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
@@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry,
}
/* no matching parent, look for matching child */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
-
+ list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
@@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry,
add_names:
if (!found_parent) {
- if (audit_inc_name_count(context, parent))
+ n = audit_alloc_name(context);
+ if (!n)
return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
- audit_copy_inode(&context->names[idx], NULL, parent);
+ audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- if (audit_inc_name_count(context, inode))
+ n = audit_alloc_name(context);
+ if (!n)
return;
- idx = context->name_count - 1;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- context->names[idx].name = found_parent;
- context->names[idx].name_len = AUDIT_NAME_FULL;
+ n->name = found_parent;
+ n->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- context->names[idx].name_put = 0;
- } else {
- context->names[idx].name = NULL;
+ n->name_put = false;
}
if (inode)
- audit_copy_inode(&context->names[idx], NULL, inode);
- else
- context->names[idx].ino = (unsigned long)-1;
+ audit_copy_inode(n, NULL, inode);
}
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
static atomic_t session_id = ATOMIC_INIT(0);
/**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(uid_t loginuid)
{
- unsigned int sessionid = atomic_inc_return(&session_id);
+ struct task_struct *task = current;
struct audit_context *context = task->audit_context;
+ unsigned int sessionid;
+
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+ if (task->loginuid != -1)
+ return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ sessionid = atomic_inc_return(&session_id);
if (context && context->in_syscall) {
struct audit_buffer *ab;
@@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
context->ipc.has_perm = 1;
}
-int audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
{
struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- if (likely(!audit_enabled || !context || context->dummy))
- return 0;
-
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
return -ENOMEM;
@@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm)
* @args: args array
*
*/
-void audit_socketcall(int nargs, unsigned long *args)
+void __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return;
-
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2)
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return 0;
-
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
if (!p)
@@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+ uid_t auid, uid;
+ gid_t gid;
+ unsigned int sessionid;
+
+ auid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ current_uid_gid(&uid, &gid);
+
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ auid, uid, gid, sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_untrustedstring(ab, current->comm);
+ audit_log_format(ab, " reason=");
+ audit_log_string(ab, reason);
+ audit_log_format(ab, " sig=%ld", signr);
+}
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags)
void audit_core_dumps(long signr)
{
struct audit_buffer *ab;
- u32 sid;
- uid_t auid = audit_get_loginuid(current), uid;
- gid_t gid;
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- current_uid_gid(&uid, &gid);
- audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
- security_task_getsecid(current, &sid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
+ audit_log_abend(ab, "memory violation", signr);
+ audit_log_end(ab);
+}
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
- audit_log_format(ab, " pid=%d comm=", current->pid);
- audit_log_untrustedstring(ab, current->comm);
- audit_log_format(ab, " sig=%ld", signr);
+void __audit_seccomp(unsigned long syscall)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ audit_log_abend(ab, "seccomp", SIGKILL);
+ audit_log_format(ab, " syscall=%ld", syscall);
audit_log_end(ab);
}
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e69..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
}
/**
- * has_capability - Does a task have a capability in init_user_ns
+ * has_ns_capability - Does a task have a capability in a specific user ns
* @t: The task in question
+ * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
+ * currently in effect to the specified user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_capability(struct task_struct *t, int cap)
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
{
- int ret = security_real_capable(t, &init_user_ns, cap);
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable(__task_cred(t), ns, cap);
+ rcu_read_unlock();
return (ret == 0);
}
/**
- * has_capability - Does a task have a capability in a specific user ns
+ * has_capability - Does a task have a capability in init_user_ns
* @t: The task in question
- * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to the specified user namespace, false if not.
+ * currently in effect to the initial user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_ns_capability(struct task_struct *t,
- struct user_namespace *ns, int cap)
+bool has_capability(struct task_struct *t, int cap)
{
- int ret = security_real_capable(t, ns, cap);
-
- return (ret == 0);
+ return has_ns_capability(t, &init_user_ns, cap);
}
/**
- * has_capability_noaudit - Does a task have a capability (unaudited)
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
* @t: The task in question
+ * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to init_user_ns, false if not. Don't write an
- * audit message for the check.
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_capability_noaudit(struct task_struct *t, int cap)
+bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
{
- int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ rcu_read_unlock();
return (ret == 0);
}
/**
- * capable - Determine if the current task has a superior capability in effect
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
* @cap: The capability to be tested for
*
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
*
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
+ * Note that this does not set PF_SUPERPRIV on the task.
*/
-bool capable(int cap)
+bool has_capability_noaudit(struct task_struct *t, int cap)
{
- return ns_capable(&init_user_ns, cap);
+ return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-EXPORT_SYMBOL(capable);
/**
* ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
BUG();
}
- if (security_capable(ns, current_cred(), cap) == 0) {
+ if (security_capable(current_cred(), ns, cap) == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
- * task_ns_capable - Determine whether current task has a superior
- * capability targeted at a specific task's user namespace.
- * @t: The task whose user namespace is targeted.
- * @cap: The capability in question.
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
*
- * Return true if it does, false otherwise.
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
*/
-bool task_ns_capable(struct task_struct *t, int cap)
+bool capable(int cap)
{
- return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+ return ns_capable(&init_user_ns, cap);
}
-EXPORT_SYMBOL(task_ns_capable);
+EXPORT_SYMBOL(capable);
/**
* nsown_capable - Check superior capability to one's own user_ns
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7cab65f83f1d..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
#include <linux/atomic.h>
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on. Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex. Readers can acquire either of the two. This is to
+ * break the following locking order cycle.
+ *
+ * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ * B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
*
* CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
+ BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1043,7 +1061,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
struct cgroup_subsys *ss;
- mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_root_mutex);
return 0;
}
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/*
* If the 'all' option was specified select all the subsystems,
- * otherwise 'all, 'none' and a subsystem name options were not
- * specified, let's default to 'all'
+ * otherwise if 'none', 'name=' and a subsystem name options
+ * were not specified, let's default to 'all'
*/
- if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+ if (all_ss || (!one_ss && !opts->none && !opts->name)) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret = 0;
struct super_block *sb;
struct cgroupfs_root *new_root;
+ struct inode *inode;
/* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* We used the new root structure, so this is a new hierarchy */
struct list_head tmp_cg_links;
struct cgroup *root_cgrp = &root->top_cgroup;
- struct inode *inode;
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
- if (strlen(root->name)) {
- /* Check for name clashes with existing mounts */
- for_each_active_root(existing_root) {
- if (!strcmp(existing_root->name, root->name)) {
- ret = -EBUSY;
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
- }
- }
+ /* Check for name clashes with existing mounts */
+ ret = -EBUSY;
+ if (strlen(root->name))
+ for_each_active_root(existing_root)
+ if (!strcmp(existing_root->name, root->name))
+ goto unlock_drop;
/*
* We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* have some link structures left over
*/
ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
+ if (ret)
+ goto unlock_drop;
ret = rebind_subsystems(root, root->subsys_bits);
if (ret == -EBUSY) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
free_cg_links(&tmp_cg_links);
- goto drop_new_super;
+ goto unlock_drop;
}
/*
* There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
cred = override_creds(&init_cred);
cgroup_populate_dir(root_cgrp);
revert_creds(cred);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
} else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
kfree(opts.name);
return dget(sb->s_root);
+ unlock_drop:
+ mutex_unlock(&cgroup_root_mutex);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
drop_new_super:
deactivate_locked_super(sb);
drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* Rebind all subsystems back to the default hierarchy */
ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
root_count--;
}
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
EXPORT_SYMBOL_GPL(cgroup_path);
/*
+ * Control Group taskset
+ */
+struct task_and_cgroup {
+ struct task_struct *task;
+ struct cgroup *cgrp;
+};
+
+struct cgroup_taskset {
+ struct task_and_cgroup single;
+ struct flex_array *tc_array;
+ int tc_array_len;
+ int idx;
+ struct cgroup *cur_cgrp;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+ if (tset->tc_array) {
+ tset->idx = 0;
+ return cgroup_taskset_next(tset);
+ } else {
+ tset->cur_cgrp = tset->single.cgrp;
+ return tset->single.task;
+ }
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+ struct task_and_cgroup *tc;
+
+ if (!tset->tc_array || tset->idx >= tset->tc_array_len)
+ return NULL;
+
+ tc = flex_array_get(tset->tc_array, tset->idx++);
+ tset->cur_cgrp = tc->cgrp;
+ return tc->task;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+
+/**
+ * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * @tset: taskset of interest
+ *
+ * Return the cgroup for the current (last returned) task of @tset. This
+ * function must be preceded by either cgroup_taskset_first() or
+ * cgroup_taskset_next().
+ */
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+{
+ return tset->cur_cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+
+/**
+ * cgroup_taskset_size - return the number of tasks in taskset
+ * @tset: taskset of interest
+ */
+int cgroup_taskset_size(struct cgroup_taskset *tset)
+{
+ return tset->tc_array ? tset->tc_array_len : 1;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_size);
+
+
+/*
* cgroup_task_migrate - move a task from one cgroup to another.
*
* 'guarantee' is set if the caller promises that a new css_set for the task
* will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
*/
static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct css_set *newcg;
/*
- * get old css_set. we need to take task_lock and refcount it, because
- * an exiting task can change its css_set to init_css_set and drop its
- * old one without taking cgroup_mutex.
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
*/
- task_lock(tsk);
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
oldcg = tsk->cgroups;
- get_css_set(oldcg);
- task_unlock(tsk);
/* locate or allocate a new css_set for this task. */
if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
might_sleep();
/* find_css_set will give us newcg already referenced. */
newcg = find_css_set(oldcg, cgrp);
- if (!newcg) {
- put_css_set(oldcg);
+ if (!newcg)
return -ENOMEM;
- }
}
- put_css_set(oldcg);
- /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
- }
rcu_assign_pointer(tsk->cgroups, newcg);
task_unlock(tsk);
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* @cgrp: the cgroup the task is attaching to
* @tsk: the task to be attached
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Call with cgroup_mutex and threadgroup locked. May take task_lock of
+ * @tsk during call.
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = { };
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
/* Nothing to do if the task is already in that cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
return 0;
+ tset.single.task = tsk;
+ tset.single.cgrp = oldcgrp;
+
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk);
+ retval = ss->can_attach(ss, cgrp, &tset);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
- if (ss->can_attach_task) {
- retval = ss->can_attach_task(cgrp, tsk);
- if (retval) {
- failed_ss = ss;
- goto out;
- }
- }
}
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
for_each_subsys(root, ss) {
- if (ss->pre_attach)
- ss->pre_attach(cgrp);
- if (ss->attach_task)
- ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+ ss->attach(ss, cgrp, &tset);
}
synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk);
+ ss->cancel_attach(ss, cgrp, &tset);
}
}
return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
read_lock(&css_set_lock);
newcg = find_existing_css_set(cg, cgrp, template);
- if (newcg)
- get_css_set(newcg);
read_unlock(&css_set_lock);
/* doesn't exist at all? */
if (!newcg)
return false;
/* see if it's already in the list */
- list_for_each_entry(cg_entry, newcg_list, links) {
- if (cg_entry->cg == newcg) {
- put_css_set(newcg);
+ list_for_each_entry(cg_entry, newcg_list, links)
+ if (cg_entry->cg == newcg)
return true;
- }
- }
/* not found */
- put_css_set(newcg);
return false;
}
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
* @cgrp: the cgroup to attach to
* @leader: the threadgroup leader task_struct of the group to be attached
*
- * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
- * take task_lock of each thread in leader's threadgroup individually in turn.
+ * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
+ * task_lock of each thread in leader's threadgroup individually in turn.
*/
-int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
- bool cancel_failed_ss = false;
/* guaranteed to be initialized later, but the compiler needs this */
- struct cgroup *oldcgrp = NULL;
struct css_set *oldcg;
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
+ struct task_and_cgroup *tc;
struct flex_array *group;
+ struct cgroup_taskset tset = { };
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* step 0: in order to do expensive, possibly blocking operations for
* every thread, we cannot iterate the thread group list, since it needs
* rcu or tasklist locked. instead, build an array of all threads in the
- * group - threadgroup_fork_lock prevents new threads from appearing,
- * and if threads exit, this will just be an over-estimate.
+ * group - group_rwsem prevents new threads from appearing, and if
+ * threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
/* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(struct task_struct *), group_size,
- GFP_KERNEL);
+ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
retval = -EAGAIN;
goto out_free_group_list;
}
- /* take a reference on each task in the group to go in the array. */
+
tsk = leader;
i = 0;
do {
+ struct task_and_cgroup ent;
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ continue;
+
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
- get_task_struct(tsk);
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
*/
- retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ ent.task = tsk;
+ ent.cgrp = task_cgroup_from_root(tsk, root);
+ /* nothing to do if this task is already in the cgroup */
+ if (ent.cgrp == cgrp)
+ continue;
+ retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
group_size = i;
+ tset.tc_array = group;
+ tset.tc_array_len = group_size;
read_unlock(&tasklist_lock);
+ /* methods shouldn't be called if no task is actually migrating */
+ retval = 0;
+ if (!group_size)
+ goto out_free_group_list;
+
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, leader);
+ retval = ss->can_attach(ss, cgrp, &tset);
if (retval) {
failed_ss = ss;
goto out_cancel_attach;
}
}
- /* a callback to be run on every thread in the threadgroup. */
- if (ss->can_attach_task) {
- /* run on each task in the threadgroup. */
- for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- retval = ss->can_attach_task(cgrp, tsk);
- if (retval) {
- failed_ss = ss;
- cancel_failed_ss = true;
- goto out_cancel_attach;
- }
- }
- }
}
/*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- /* nothing to do if this task is already in the cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- continue;
- /* get old css_set pointer */
- task_lock(tsk);
- oldcg = tsk->cgroups;
- get_css_set(oldcg);
- task_unlock(tsk);
- /* see if the new one for us is already in the list? */
- if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
- /* was already there, nothing to do. */
- put_css_set(oldcg);
- } else {
- /* we don't already have it. get new one. */
+ tc = flex_array_get(group, i);
+ oldcg = tc->task->cgroups;
+
+ /* if we don't already have it in the list get a new one */
+ if (!css_set_check_fetched(cgrp, tc->task, oldcg,
+ &newcg_list)) {
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
- put_css_set(oldcg);
if (retval)
goto out_list_teardown;
}
}
/*
- * step 3: now that we're guaranteed success wrt the css_sets, proceed
- * to move all tasks to the new cgroup, calling ss->attach_task for each
- * one along the way. there are no failure cases after here, so this is
- * the commit point.
+ * step 3: now that we're guaranteed success wrt the css_sets,
+ * proceed to move all tasks to the new cgroup. There are no
+ * failure cases after here, so this is the commit point.
*/
- for_each_subsys(root, ss) {
- if (ss->pre_attach)
- ss->pre_attach(cgrp);
- }
for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- /* leave current thread as it is if it's already there */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- continue;
- /* if the thread is PF_EXITING, it can just get skipped. */
- retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
- if (retval == 0) {
- /* attach each task to each subsystem */
- for_each_subsys(root, ss) {
- if (ss->attach_task)
- ss->attach_task(cgrp, tsk);
- }
- } else {
- BUG_ON(retval != -ESRCH);
- }
+ tc = flex_array_get(group, i);
+ retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
+ BUG_ON(retval);
}
/* nothing is sensitive to fork() after this point. */
/*
- * step 4: do expensive, non-thread-specific subsystem callbacks.
- * TODO: if ever a subsystem needs to know the oldcgrp for each task
- * being moved, this call will need to be reworked to communicate that.
+ * step 4: do subsystem attach callbacks.
*/
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, leader);
+ ss->attach(ss, cgrp, &tset);
}
/*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
/* same deal as in cgroup_attach_task */
if (retval) {
for_each_subsys(root, ss) {
- if (ss == failed_ss) {
- if (cancel_failed_ss && ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, leader);
+ if (ss == failed_ss)
break;
- }
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, leader);
+ ss->cancel_attach(ss, cgrp, &tset);
}
}
- /* clean up the array of referenced threads in the group. */
- for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- put_task_struct(tsk);
- }
out_free_group_list:
flex_array_free(group);
return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will take
- * cgroup_mutex; may take task_lock of task.
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup; may take task_lock of task.
*/
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
* detect it later.
*/
tsk = tsk->group_leader;
- } else if (tsk->flags & PF_EXITING) {
- /* optimization for the single-task-only case */
- rcu_read_unlock();
- cgroup_unlock();
- return -ESRCH;
}
-
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
get_task_struct(tsk);
}
- if (threadgroup) {
- threadgroup_fork_write_lock(tsk);
+ threadgroup_lock(tsk);
+
+ if (threadgroup)
ret = cgroup_attach_proc(cgrp, tsk);
- threadgroup_fork_write_unlock(tsk);
- } else {
+ else
ret = cgroup_attach_task(cgrp, tsk);
- }
+
+ threadgroup_unlock(tsk);
+
put_task_struct(tsk);
cgroup_unlock();
return ret;
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
+ mutex_lock(&cgroup_root_mutex);
strcpy(cgrp->root->release_agent_path, buffer);
+ mutex_unlock(&cgroup_root_mutex);
cgroup_unlock();
return 0;
}
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
}
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+ __acquires(css_set_lock)
{
/*
* The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
}
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+ __releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
+ * it was not made under the protection of RCU, cgroup_mutex or
+ * threadgroup_change_begin(), so it might no longer be a valid
+ * cgroup pointer. cgroup_attach_task() might have already changed
+ * current->cgroups, allowing the previously referenced cgroup
+ * group to be removed and freed.
+ *
+ * Outside the pointer validity we also need to process the css_set
+ * inheritance between threadgoup_change_begin() and
+ * threadgoup_change_end(), this way there is no leak in any process
+ * wide migration performed by cgroup_attach_proc() that could otherwise
+ * miss a thread because it is too early or too late in the fork stage.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
+ /*
+ * We don't need to task_lock() current because current->cgroups
+ * can't be changed concurrently here. The parent obviously hasn't
+ * exited and called cgroup_exit(), and we are synchronized against
+ * cgroup migration through threadgroup_change_begin().
+ */
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
- task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
+ if (list_empty(&child->cg_list)) {
+ /*
+ * It's safe to use child->cgroups without task_lock()
+ * here because we are protected through
+ * threadgroup_change_begin() against concurrent
+ * css_set change in cgroup_task_migrate(). Also
+ * the task can't exit at that point until
+ * wake_up_new_task() is called, so we are protected
+ * against cgroup_exit() setting child->cgroup to
+ * init_css_set.
+ */
list_add(&child->cg_list, &child->cgroups->tasks);
- task_unlock(child);
+ }
write_unlock(&css_set_lock);
}
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fcb93fca782d..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -166,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task)
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task)
+ struct cgroup_taskset *tset)
{
struct freezer *freezer;
+ struct task_struct *task;
/*
* Anything frozen can't move or be moved to/from.
*/
+ cgroup_taskset_for_each(task, new_cgroup, tset)
+ if (cgroup_freezing(task))
+ return -EBUSY;
freezer = cgroup_freezer(new_cgroup);
if (freezer->state != CGROUP_THAWED)
@@ -181,11 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
return 0;
}
-static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
- return cgroup_freezing(tsk) ? -EBUSY : 0;
-}
-
static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
- .can_attach_task = freezer_can_attach_task,
- .pre_attach = NULL,
- .attach_task = NULL,
- .attach = NULL,
.fork = freezer_fork,
- .exit = NULL,
};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b1712dba587..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk)
-{
- struct cpuset *cs = cgroup_cs(cont);
-
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
-
- /*
- * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
- * cannot change their cpu affinity and isolating such threads by their
- * set of allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
- * be changed.
- */
- if (tsk->flags & PF_THREAD_BOUND)
- return -EINVAL;
-
- return 0;
-}
-
-static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
-{
- return security_task_setscheduler(task);
-}
-
/*
* Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in pre_attach, and they must
- * persist among pre_attach, attach_task, and attach.
+ * dynamically allocating them is not allowed in can_attach, and they must
+ * persist until attach.
*/
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
-/* Set-up work for before attaching each task. */
-static void cpuset_pre_attach(struct cgroup *cont)
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct task_struct *task;
+ int ret;
+
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * Kthreads bound to specific cpus cannot be moved to a new
+ * cpuset; we cannot change their cpu affinity and
+ * isolating such threads by their set of allowed nodes is
+ * unnecessary. Thus, cpusets are not applicable for such
+ * threads. This prevents checking for success of
+ * set_cpus_allowed_ptr() on all attached tasks before
+ * cpus_allowed may be changed.
+ */
+ if (task->flags & PF_THREAD_BOUND)
+ return -EINVAL;
+ if ((ret = security_task_setscheduler(task)))
+ return ret;
+ }
+ /* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
-}
-
-/* Per-thread attachment work. */
-static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
-{
- int err;
- struct cpuset *cs = cgroup_cs(cont);
- /*
- * can_attach beforehand should guarantee that this doesn't fail.
- * TODO: have a better way to handle failure here
- */
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- WARN_ON_ONCE(err);
-
- cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, tsk);
+ return 0;
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
struct mm_struct *mm;
- struct cpuset *cs = cgroup_cs(cont);
- struct cpuset *oldcs = cgroup_cs(oldcont);
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *oldcs = cgroup_cs(oldcgrp);
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
+ }
/*
* Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
*/
cpuset_attach_nodemask_from = oldcs->mems_allowed;
cpuset_attach_nodemask_to = cs->mems_allowed;
- mm = get_task_mm(tsk);
+ mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
@@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
- .can_attach_task = cpuset_can_attach_task,
- .pre_attach = cpuset_pre_attach,
- .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
+ kdb_printf("%4ld ", module_refcount(mod));
#endif
if (mod->state == MODULE_STATE_GOING)
kdb_printf(" (Unloading)");
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 057e24b665cf..6581a040f399 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -115,8 +115,6 @@ int get_callchain_buffers(void)
}
err = alloc_callchain_buffers();
- if (err)
- release_callchain_buffers();
exit:
mutex_unlock(&callchain_mutex);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3afc68c08433..ba36013cfb21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event)
* here.
*/
if (is_cgroup_event(event))
- run_end = perf_event_time(event);
+ run_end = perf_cgroup_event_time(event);
else if (ctx->is_active)
run_end = ctx->time;
else
@@ -2300,6 +2300,9 @@ do { \
return div64_u64(dividend, divisor);
}
+static DEFINE_PER_CPU(int, perf_throttled_count);
+static DEFINE_PER_CPU(u64, perf_throttled_seq);
+
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
{
struct hw_perf_event *hwc = &event->hw;
@@ -2325,16 +2328,29 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
}
}
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
+ int needs_unthr)
{
struct perf_event *event;
struct hw_perf_event *hwc;
- u64 interrupts, now;
+ u64 now, period = TICK_NSEC;
s64 delta;
- if (!ctx->nr_freq)
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || needs_unthr))
return;
+ raw_spin_lock(&ctx->lock);
+
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -2344,13 +2360,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
hwc = &event->hw;
- interrupts = hwc->interrupts;
- hwc->interrupts = 0;
-
- /*
- * unthrottle events on the tick
- */
- if (interrupts == MAX_INTERRUPTS) {
+ if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
@@ -2358,14 +2369,26 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (!event->attr.freq || !event->attr.sample_freq)
continue;
- event->pmu->read(event);
+ /*
+ * stop the event and update event->count
+ */
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
+ /*
+ * restart the event
+ * reload only if value has changed
+ */
if (delta > 0)
perf_adjust_period(event, period, delta);
+
+ event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
}
+
+ raw_spin_unlock(&ctx->lock);
}
/*
@@ -2388,16 +2411,13 @@ static void rotate_ctx(struct perf_event_context *ctx)
*/
static void perf_rotate_context(struct perf_cpu_context *cpuctx)
{
- u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
struct perf_event_context *ctx = NULL;
- int rotate = 0, remove = 1, freq = 0;
+ int rotate = 0, remove = 1;
if (cpuctx->ctx.nr_events) {
remove = 0;
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
rotate = 1;
- if (cpuctx->ctx.nr_freq)
- freq = 1;
}
ctx = cpuctx->task_ctx;
@@ -2405,37 +2425,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
remove = 0;
if (ctx->nr_events != ctx->nr_active)
rotate = 1;
- if (ctx->nr_freq)
- freq = 1;
}
- if (!rotate && !freq)
+ if (!rotate)
goto done;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
- if (freq) {
- perf_ctx_adjust_freq(&cpuctx->ctx, interval);
- if (ctx)
- perf_ctx_adjust_freq(ctx, interval);
- }
-
- if (rotate) {
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- if (ctx)
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (ctx)
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
- rotate_ctx(&cpuctx->ctx);
- if (ctx)
- rotate_ctx(ctx);
+ rotate_ctx(&cpuctx->ctx);
+ if (ctx)
+ rotate_ctx(ctx);
- perf_event_sched_in(cpuctx, ctx, current);
- }
+ perf_event_sched_in(cpuctx, ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
@@ -2445,10 +2454,22 @@ void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
struct perf_cpu_context *cpuctx, *tmp;
+ struct perf_event_context *ctx;
+ int throttled;
WARN_ON(!irqs_disabled());
+ __this_cpu_inc(perf_throttled_seq);
+ throttled = __this_cpu_xchg(perf_throttled_count, 0);
+
list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+ ctx = &cpuctx->ctx;
+ perf_adjust_freq_unthr_context(ctx, throttled);
+
+ ctx = cpuctx->task_ctx;
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, throttled);
+
if (cpuctx->jiffies_interval == 1 ||
!(jiffies % cpuctx->jiffies_interval))
perf_rotate_context(cpuctx);
@@ -4509,6 +4530,7 @@ static int __perf_event_overflow(struct perf_event *event,
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
+ u64 seq;
int ret = 0;
/*
@@ -4518,14 +4540,20 @@ static int __perf_event_overflow(struct perf_event *event,
if (unlikely(!is_sampling_event(event)))
return 0;
- if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
- if (throttle) {
+ seq = __this_cpu_read(perf_throttled_seq);
+ if (seq != hwc->interrupts_seq) {
+ hwc->interrupts_seq = seq;
+ hwc->interrupts = 1;
+ } else {
+ hwc->interrupts++;
+ if (unlikely(throttle
+ && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
ret = 1;
}
- } else
- hwc->interrupts++;
+ }
if (event->attr.freq) {
u64 now = perf_clock();
@@ -6941,10 +6969,13 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void
-perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- task_function_call(task, __perf_cgroup_move, task);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset)
+ task_function_call(task, __perf_cgroup_move, task);
}
static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -6958,7 +6989,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
if (!(task->flags & PF_EXITING))
return;
- perf_cgroup_attach_task(cgrp, task);
+ task_function_call(task, __perf_cgroup_move, task);
}
struct cgroup_subsys perf_subsys = {
@@ -6967,6 +6998,6 @@ struct cgroup_subsys perf_subsys = {
.create = perf_cgroup_create,
.destroy = perf_cgroup_destroy,
.exit = perf_cgroup_exit,
- .attach_task = perf_cgroup_attach_task,
+ .attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/exit.c b/kernel/exit.c
index d9eab2e4b430..4b4042f9bc6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
+#include <linux/writeback.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -886,7 +887,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
@@ -963,8 +964,7 @@ NORET_TYPE void do_exit(long code)
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
- if (unlikely(tsk->audit_context))
- audit_free(tsk);
+ audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
@@ -1035,7 +1035,25 @@ NORET_TYPE void do_exit(long code)
validate_creds_for_do_exit(tsk);
preempt_disable();
+ if (tsk->nr_dirtied)
+ __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
@@ -1048,7 +1066,7 @@ NORET_TYPE void do_exit(long code)
EXPORT_SYMBOL_GPL(do_exit);
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
@@ -1067,7 +1085,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-NORET_TYPE void
+void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
diff --git a/kernel/fork.c b/kernel/fork.c
index f34f894c4b98..1b2ef3c23ae4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -644,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(get_task_mm);
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+ struct mm_struct *mm;
+ int err;
+
+ err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (err)
+ return ERR_PTR(err);
+
+ mm = get_task_mm(task);
+ if (mm && mm != current->mm &&
+ !ptrace_may_access(task, mode)) {
+ mmput(mm);
+ mm = ERR_PTR(-EACCES);
+ }
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return mm;
+}
+
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
@@ -870,6 +893,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
+ struct io_context *new_ioc;
if (!ioc)
return 0;
@@ -881,11 +905,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
if (unlikely(!tsk->io_context))
return -ENOMEM;
} else if (ioprio_valid(ioc->ioprio)) {
- tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
- if (unlikely(!tsk->io_context))
+ new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!new_ioc))
return -ENOMEM;
- tsk->io_context->ioprio = ioc->ioprio;
+ new_ioc->ioprio = ioc->ioprio;
+ put_io_context(new_ioc, NULL);
}
#endif
return 0;
@@ -972,7 +997,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sched_autogroup_fork(sig);
#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->threadgroup_fork_lock);
+ init_rwsem(&sig->group_rwsem);
#endif
sig->oom_adj = current->signal->oom_adj;
@@ -1153,7 +1178,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_lock(current);
+ threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1291,6 +1316,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
/*
* Ok, make it visible to the rest of the system.
@@ -1368,8 +1394,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
perf_event_fork(p);
+
+ trace_task_newtask(p, clone_flags);
+
return p;
bad_fork_free_pid:
@@ -1403,7 +1432,7 @@ bad_fork_cleanup_policy:
bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
@@ -1518,8 +1547,6 @@ long do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
- audit_finish_fork(p);
-
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
#define istate core_internal_state__do_not_mess_with_it
-extern int noirqdebug;
+extern bool noirqdebug;
/*
* Bits used by threaded handlers:
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a948be2..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->irqs_unhandled = 0;
}
-int noirqdebug __read_mostly;
+bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 30c3c7708132..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -71,6 +71,7 @@ void jump_label_inc(struct jump_label_key *key)
atomic_inc(&key->enabled);
jump_label_unlock();
}
+EXPORT_SYMBOL_GPL(jump_label_inc);
static void __jump_label_dec(struct jump_label_key *key,
unsigned long rate_limit, struct delayed_work *work)
@@ -86,6 +87,7 @@ static void __jump_label_dec(struct jump_label_key *key,
jump_label_unlock();
}
+EXPORT_SYMBOL_GPL(jump_label_dec);
static void jump_label_update_timeout(struct work_struct *work)
{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 090ee10d9604..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
-#include <linux/kmsg_dump.h>
#include <linux/syscore_ops.h>
#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
if (kexec_crash_image) {
struct pt_regs fixed_regs;
- kmsg_dump(KMSG_DUMP_KEXEC);
-
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
{
int ret = 0;
unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
mutex_lock(&kexec_mutex);
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
}
start = crashk_res.start;
end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
- if (new_size >= end - start + 1) {
- ret = -EINVAL;
- if (new_size == end - start + 1)
- ret = 0;
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
goto unlock;
}
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
+
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
crashk_res.end = end - 1;
+
+ insert_resource(&iomem_resource, ram_res);
crash_unmap_reserved_pages();
unlock:
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..29f5b65bee29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- INIT_HLIST_HEAD(&empty_rp);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
- int buf_size;
+ size_t buf_size;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt , a...)
-#endif
-
#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
#endif
@@ -138,7 +132,6 @@ struct load_info {
unsigned long len;
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
- unsigned long *strmap;
unsigned long symoffs, stroffs;
struct _ddebug *debug;
unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
return fsa.sym;
}
- DEBUGP("Failed to find symbol %s\n", name);
+ pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
list_for_each_entry(use, &b->source_list, source_list) {
if (use->source == a) {
- DEBUGP("%s uses %s!\n", a->name, b->name);
+ pr_debug("%s uses %s!\n", a->name, b->name);
return 1;
}
}
- DEBUGP("%s does not use %s!\n", a->name, b->name);
+ pr_debug("%s does not use %s!\n", a->name, b->name);
return 0;
}
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
{
struct module_use *use;
- DEBUGP("Allocating new usage for %s.\n", a->name);
+ pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
mutex_lock(&module_mutex);
list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
struct module *i = use->target;
- DEBUGP("%s unusing %s\n", mod->name, i->name);
+ pr_debug("%s unusing %s\n", mod->name, i->name);
module_put(i);
list_del(&use->source_list);
list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
}
}
-unsigned int module_refcount(struct module *mod)
+unsigned long module_refcount(struct module *mod)
{
- unsigned int incs = 0, decs = 0;
+ unsigned long incs = 0, decs = 0;
int cpu;
for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
/* Since we might sleep for some time, release the mutex first */
mutex_unlock(&module_mutex);
for (;;) {
- DEBUGP("Looking at refcount...\n");
+ pr_debug("Looking at refcount...\n");
set_current_state(TASK_UNINTERRUPTIBLE);
if (module_refcount(mod) == 0)
break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
if (mod->state != MODULE_STATE_LIVE) {
/* FIXME: if (force), slam module count and wake up
waiter --RR */
- DEBUGP("%s already dying\n", mod->name);
+ pr_debug("%s already dying\n", mod->name);
ret = -EBUSY;
goto out;
}
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
struct module_use *use;
int printed_something = 0;
- seq_printf(m, " %u ", module_refcount(mod));
+ seq_printf(m, " %lu ", module_refcount(mod));
/* Always include a trailing , so userspace can differentiate
between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", module_refcount(mk->mod));
+ return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
}
-static struct module_attribute refcnt = {
- .attr = { .name = "refcnt", .mode = 0444 },
- .show = show_refcnt,
-};
+static struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
void module_put(struct module *module)
{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+ size_t l = 0;
+
+ if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+ buf[l++] = 'P';
+ if (mod->taints & (1 << TAINT_OOT_MODULE))
+ buf[l++] = 'O';
+ if (mod->taints & (1 << TAINT_FORCED_MODULE))
+ buf[l++] = 'F';
+ if (mod->taints & (1 << TAINT_CRAP))
+ buf[l++] = 'C';
+ /*
+ * TAINT_FORCED_RMMOD: could be added.
+ * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * apply to modules.
+ */
+ return l;
+}
+
static ssize_t show_initstate(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
return sprintf(buffer, "%s\n", state);
}
-static struct module_attribute initstate = {
- .attr = { .name = "initstate", .mode = 0444 },
- .show = show_initstate,
-};
+static struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
return count;
}
-struct module_attribute module_uevent = {
- .attr = { .name = "uevent", .mode = 0200 },
- .store = store_uevent,
-};
+struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->core_size);
+}
+
+static struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->init_size);
+}
+
+static struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
static struct module_attribute *modinfo_attrs[] = {
+ &module_uevent,
&modinfo_version,
&modinfo_srcversion,
- &initstate,
- &module_uevent,
+ &modinfo_initstate,
+ &modinfo_coresize,
+ &modinfo_initsize,
+ &modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
- &refcnt,
+ &modinfo_refcnt,
#endif
NULL,
};
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
if (versions[i].crc == maybe_relocated(*crc, crc_owner))
return 1;
- DEBUGP("Found checksum %lX vs module %lX\n",
+ pr_debug("Found checksum %lX vs module %lX\n",
maybe_relocated(*crc, crc_owner), versions[i].crc);
goto bad_version;
}
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_COMMON:
/* We compiled with -fno-common. These are not
supposed to happen. */
- DEBUGP("Common symbol: %s\n", name);
+ pr_debug("Common symbol: %s\n", name);
printk("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_ABS:
/* Don't need to do anything */
- DEBUGP("Absolute symbol: 0x%08lx\n",
+ pr_debug("Absolute symbol: 0x%08lx\n",
(long)sym[i].st_value);
break;
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++)
info->sechdrs[i].sh_entsize = ~0UL;
- DEBUGP("Core section allocation order:\n");
+ pr_debug("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| strstarts(sname, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
- DEBUGP("\t%s\n", name);
+ pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
}
}
- DEBUGP("Init section allocation order:\n");
+ pr_debug("Init section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
- DEBUGP("\t%s\n", sname);
+ pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
return true;
}
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
static void layout_symtab(struct module *mod, struct load_info *info)
{
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- unsigned int i, nsrc, ndst;
+ unsigned int i, nsrc, ndst, strtab_size;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- for (ndst = i = 1; i < nsrc; ++i, ++src)
- if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
- unsigned int j = src->st_name;
- while (!__test_and_set_bit(j, info->strmap)
- && info->strtab[j])
- ++j;
- ++ndst;
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
+ if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
+ strtab_size += strlen(&info->strtab[src->st_name]) + 1;
+ ndst++;
}
/* Append room for core symbols at end of core part. */
info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+ info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_size += strtab_size;
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* Append room for core symbols' strings at end of core part. */
- info->stroffs = mod->core_size;
- __set_bit(0, info->strmap);
- mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
mod->core_symtab = dst = mod->module_core + info->symoffs;
+ mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
*dst = *src;
+ *s++ = 0;
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
continue;
+
dst[ndst] = *src;
- dst[ndst].st_name = bitmap_weight(info->strmap,
- dst[ndst].st_name);
- ++ndst;
+ dst[ndst++].st_name = s - mod->core_strtab;
+ s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
}
mod->core_num_syms = ndst;
-
- mod->core_strtab = s = mod->module_core + info->stroffs;
- for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
- if (test_bit(i, info->strmap))
- *++s = mod->strtab[i];
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
mod->module_init = ptr;
/* Transfer each section which specifies SHF_ALLOC */
- DEBUGP("final section addresses:\n");
+ pr_debug("final section addresses:\n");
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
/* Update sh_addr to point to copy in image. */
shdr->sh_addr = (unsigned long)dest;
- DEBUGP("\t0x%lx %s\n",
- shdr->sh_addr, info->secstrings + shdr->sh_name);
+ pr_debug("\t0x%lx %s\n",
+ (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
}
return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
this is done generically; there doesn't appear to be any
special cases for the architectures. */
layout_sections(mod, info);
-
- info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
- * sizeof(long), GFP_KERNEL);
- if (!info->strmap) {
- err = -ENOMEM;
- goto free_percpu;
- }
layout_symtab(mod, info);
/* Allocate and move to the final place */
err = move_module(mod, info);
if (err)
- goto free_strmap;
+ goto free_percpu;
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
return mod;
-free_strmap:
- kfree(info->strmap);
free_percpu:
percpu_modfree(mod);
out:
@@ -2772,7 +2802,6 @@ out:
/* mod is no longer valid after this! */
static void module_deallocate(struct module *mod, struct load_info *info)
{
- kfree(info->strmap);
percpu_modfree(mod);
module_free(mod, mod->module_init);
module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
struct module *mod;
long err;
- DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);
/* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
if (err < 0)
goto unlink;
- /* Get rid of temporary copy and strmap. */
- kfree(info.strmap);
+ /* Get rid of temporary copy. */
free_copy(&info);
/* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
- if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
- buf[bx++] = 'P';
- else if (mod->taints & (1 << TAINT_OOT_MODULE))
- buf[bx++] = 'O';
- if (mod->taints & (1 << TAINT_FORCED_MODULE))
- buf[bx++] = 'F';
- if (mod->taints & (1 << TAINT_CRAP))
- buf[bx++] = 'C';
- /*
- * TAINT_FORCED_RMMOD: could be added.
- * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
- * apply to modules.
- */
-
+ bx += module_flags_taint(mod, buf + bx);
/* Show a - for module-is-being-unloaded */
if (mod->state == MODULE_STATE_GOING)
buf[bx++] = '-';
diff --git a/kernel/panic.c b/kernel/panic.c
index 3458469eb7c3..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
long (*panic_blink)(int state);
EXPORT_SYMBOL(panic_blink);
+/*
+ * Stop ourself in panic -- architecture code may override this
+ */
+void __weak panic_smp_self_stop(void)
+{
+ while (1)
+ cpu_relax();
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
-NORET_TYPE void panic(const char * fmt, ...)
+void panic(const char *fmt, ...)
{
+ static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
+ *
+ * Only one CPU is allowed to execute the panic code from here. For
+ * multiple parallel invocations of panic, all other CPUs either
+ * stop themself or will wait until they are stopped by the 1st CPU
+ * with smp_send_stop().
*/
- preempt_disable();
+ if (!spin_trylock(&panic_lock))
+ panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
- dump_stack();
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!oops_in_progress)
+ dump_stack();
#endif
/*
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
#include <linux/slab.h>
#include <linux/ctype.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt, a...)
-#endif
-
/* Protects all parameters, and incidentally kmalloced_param list. */
static DEFINE_MUTEX(param_lock);
@@ -105,7 +99,7 @@ static int parse_one(char *param,
/* No one handled NULL, so do it here. */
if (!val && params[i].ops->set != param_set_bool)
return -EINVAL;
- DEBUGP("They are equal! Calling %p\n",
+ pr_debug("They are equal! Calling %p\n",
params[i].ops->set);
mutex_lock(&param_lock);
err = params[i].ops->set(val, &params[i]);
@@ -115,11 +109,11 @@ static int parse_one(char *param,
}
if (handle_unknown) {
- DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+ pr_debug("Unknown argument: calling %p\n", handle_unknown);
return handle_unknown(param, val);
}
- DEBUGP("Unknown argument `%s'\n", param);
+ pr_debug("Unknown argument `%s'\n", param);
return -ENOENT;
}
@@ -184,7 +178,7 @@ int parse_args(const char *name,
{
char *param, *val;
- DEBUGP("Parsing ARGS: %s\n", args);
+ pr_debug("Parsing ARGS: %s\n", args);
/* Chew leading spaces */
args = skip_spaces(args);
@@ -369,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
};
EXPORT_SYMBOL(param_ops_invbool);
+int param_set_bint(const char *val, const struct kernel_param *kp)
+{
+ struct kernel_param boolkp;
+ bool v;
+ int ret;
+
+ /* Match bool exactly, by re-using it. */
+ boolkp = *kp;
+ boolkp.arg = &v;
+ boolkp.flags |= KPARAM_ISBOOL;
+
+ ret = param_set_bool(val, &boolkp);
+ if (ret == 0)
+ *(int *)kp->arg = v;
+ return ret;
+}
+EXPORT_SYMBOL(param_set_bint);
+
+struct kernel_param_ops param_ops_bint = {
+ .set = param_set_bint,
+ .get = param_get_int,
+};
+EXPORT_SYMBOL(param_ops_bint);
+
/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
}
/*
- * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We might be racing with someone else trying to set pid_ns->last_pid
+ * at the pid allocation time (there's also a sysctl for this, but racing
+ * with this one is OK, see comment in kernel/pid_namespace.c about it).
* We want the winner to have the "later" value, because if the
* "earlier" value prevails, then a pid may get reused immediately.
*
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
return;
}
+static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Writing directly to ns' last_pid field is OK, since this field
+ * is volatile in a living namespace anyway and a code writing to
+ * it should synchronize its usage with external means.
+ */
+
+ tmp.data = &current->nsproxy->pid_ns->last_pid;
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table[] = {
+ {
+ .procname = "ns_last_pid",
+ .maxlen = sizeof(int),
+ .mode = 0666, /* permissions are checked in the handler */
+ .proc_handler = pid_ns_ctl_handler,
+ },
+ { }
+};
+
+static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ register_sysctl_paths(kern_path, pid_ns_ctl_table);
return 0;
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 77274c9ba2f1..eeca00311f39 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -188,3 +188,22 @@ void thaw_processes(void)
printk("done.\n");
}
+void thaw_kernel_threads(void)
+{
+ struct task_struct *g, *p;
+
+ pm_nosig_freezing = false;
+ printk("Restarting kernel threads ... ");
+
+ thaw_workqueues();
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ __thaw_task(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+
+ schedule();
+ printk("done.\n");
+}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..6a768e537001 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone)
unsigned int res;
res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+ res += DIV_ROUND_UP(res * sizeof(struct bm_block),
+ LINKED_PAGE_DATA_SIZE);
return 2 * res;
}
@@ -858,6 +859,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
PageReserved(page))
return NULL;
+ if (page_is_guard(page))
+ return NULL;
+
return page;
}
@@ -920,6 +924,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
+ if (page_is_guard(page))
+ return NULL;
+
return page;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3739ecced085..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -773,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
pr_debug("PM: Free swap pages: %u\n", free_swap);
- required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
- nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
+ required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
}
@@ -802,10 +801,12 @@ int swsusp_write(unsigned int flags)
printk(KERN_ERR "PM: Cannot get swap writer\n");
return error;
}
- if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
- error = -ENOSPC;
- goto out_finish;
+ if (flags & SF_NOCOMPRESS_MODE) {
+ if (!enough_swap(pages, flags)) {
+ printk(KERN_ERR "PM: Not enough free swap\n");
+ error = -ENOSPC;
+ goto out_finish;
+ }
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6b1ab7a88522..e5a21a857302 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -274,6 +274,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
data->ready = 0;
+ /*
+ * It is necessary to thaw kernel threads here, because
+ * SNAPSHOT_CREATE_IMAGE may be invoked directly after
+ * SNAPSHOT_FREE. In that case, if kernel threads were not
+ * thawed, the preallocation of memory carried out by
+ * hibernation_snapshot() might run into problems (i.e. it
+ * might fail or even deadlock).
+ */
+ thaw_kernel_threads();
break;
case SNAPSHOT_PREF_IMAGE_SIZE:
diff --git a/kernel/printk.c b/kernel/printk.c
index 989e4a52da76..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
}
}
-static int __read_mostly ignore_loglevel;
+static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
}
early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
"print all kernel messages to the console.");
@@ -696,9 +696,9 @@ static void zap_locks(void)
}
#if defined(CONFIG_PRINTK_TIME)
-static int printk_time = 1;
+static bool printk_time = 1;
#else
-static int printk_time = 0;
+static bool printk_time = 0;
#endif
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
@@ -1098,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
return -1;
}
-int console_suspend_enabled = 1;
+bool console_suspend_enabled = 1;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 78ab24a7b0e4..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -172,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
+static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+{
+ if (mode & PTRACE_MODE_NOAUDIT)
+ return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ else
+ return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+}
+
int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
@@ -198,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
cred->gid == tcred->sgid &&
cred->gid == tcred->gid))
goto ok;
- if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ if (ptrace_has_cap(tcred->user->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -207,7 +215,7 @@ ok:
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
+ if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -277,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
task->ptrace = PT_PTRACED;
if (seize)
task->ptrace |= PT_SEIZED;
- if (task_ns_capable(task, CAP_SYS_PTRACE))
+ if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 88f17b8a3b1d..a58ac285fc69 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
static int stat_interval; /* Interval between stats, in seconds. */
/* Defaults to "only at end of test". */
-static int verbose; /* Print more debug info. */
-static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
+static bool verbose; /* Print more debug info. */
+static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
static int stutter = 5; /* Start/stop testing interval (in sec) */
static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -1399,7 +1399,7 @@ rcu_torture_shutdown(void *arg)
* Execute random CPU-hotplug operations at the interval specified
* by the onoff_interval.
*/
-static int
+static int __cpuinit
rcu_torture_onoff(void *arg)
{
int cpu;
@@ -1447,7 +1447,7 @@ rcu_torture_onoff(void *arg)
return 0;
}
-static int
+static int __cpuinit
rcu_torture_onoff_init(void)
{
if (onoff_interval <= 0)
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..d508363858b3 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -66,6 +66,31 @@ done:
return ret;
}
+int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at)
+{
+ int ret, r;
+ unsigned long flags;
+ struct res_counter *c;
+
+ r = ret = 0;
+ *limit_fail_at = NULL;
+ local_irq_save(flags);
+ for (c = counter; c != NULL; c = c->parent) {
+ spin_lock(&c->lock);
+ r = res_counter_charge_locked(c, val);
+ if (r)
+ c->usage += val;
+ spin_unlock(&c->lock);
+ if (r < 0 && ret == 0) {
+ *limit_fail_at = c;
+ ret = r;
+ }
+ }
+ local_irq_restore(flags);
+
+ return ret;
+}
void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
{
if (WARN_ON(counter->usage < val))
@@ -159,8 +184,7 @@ int res_counter_memparse_write_strategy(const char *buf,
return 0;
}
- /* FIXME - make memparse() take const char* args */
- *res = memparse((char *)buf, &end);
+ *res = memparse(buf, &end);
if (*end != '\0')
return -EINVAL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0ac0f811d623..5255c9d2e053 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif
@@ -723,9 +724,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
p->sched_class->dequeue_task(rq, p, flags);
}
-/*
- * activate_task - move a task to the runqueue.
- */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
@@ -734,9 +732,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
@@ -4134,7 +4129,7 @@ recheck:
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -4147,7 +4142,7 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -4330,7 +4325,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+ if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -4998,9 +4993,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* placed properly.
*/
if (p->on_rq) {
- deactivate_task(rq_src, p, 0);
+ dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
- activate_task(rq_dest, p, 0);
+ enqueue_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);
}
done:
@@ -7032,10 +7027,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
on_rq = p->on_rq;
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
resched_task(rq->curr);
}
@@ -7134,10 +7129,6 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-#else /* !CONFIG_RT_GROUP_SCHED */
-#endif /* CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
@@ -7246,9 +7237,6 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#endif
-
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
@@ -7563,24 +7551,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
sched_destroy_group(tg);
}
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
- return -EINVAL;
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ return -EINVAL;
#else
- /* We don't support RT-tasks being in separate groups */
- if (tsk->sched_class != &fair_sched_class)
- return -EINVAL;
+ /* We don't support RT-tasks being in separate groups */
+ if (task->sched_class != &fair_sched_class)
+ return -EINVAL;
#endif
+ }
return 0;
}
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- sched_move_task(tsk);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset)
+ sched_move_task(task);
}
static void
@@ -7915,8 +7910,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach_task = cpu_cgroup_can_attach_task,
- .attach_task = cpu_cgroup_attach_task,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b0d798eaf130..d72586fdf660 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the cpu priority setting
* @cp: The cpupri context
* @cpu: The target cpu
- * @pri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/**
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
- * @bootmem: true if allocations need to use bootmem
*
* Returns: -ENOMEM if memory fails.
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e42de9105f8..7c6414fc669d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3130,8 +3130,10 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
}
#define LBF_ALL_PINNED 0x01
-#define LBF_NEED_BREAK 0x02
-#define LBF_ABORT 0x04
+#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
+#define LBF_HAD_BREAK 0x04
+#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT 0x10
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
@@ -4508,7 +4510,9 @@ redo:
goto out_balanced;
if (lb_flags & LBF_NEED_BREAK) {
- lb_flags &= ~LBF_NEED_BREAK;
+ lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
goto redo;
}
@@ -4862,6 +4866,15 @@ static void nohz_balancer_kick(int cpu)
return;
}
+static inline void clear_nohz_tick_stopped(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+ }
+}
+
static inline void set_cpu_sd_state_busy(void)
{
struct sched_domain *sd;
@@ -4900,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
+ /*
+ * If this cpu is going down, then nothing needs to be done.
+ */
+ if (!cpu_active(cpu))
+ return;
+
if (stop_tick) {
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
@@ -4910,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick)
}
return;
}
+
+static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ clear_nohz_tick_stopped(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -5066,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
+ clear_nohz_tick_stopped(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
@@ -5586,6 +5613,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3640ebbb466b..f42ae7fb5ec5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ if (unlikely(task_running(rq, next_task)))
+ return 0;
+#endif
+
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
* This defines a simple but solid secure-computing mode.
*/
+#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
+ audit_seccomp(this_syscall);
do_exit(SIGKILL);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 56ce3a618b28..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+ struct user_namespace *ns)
+{
+ return user_ns_map_uid(ns, cred, cred->uid);
+}
+
+#ifdef CONFIG_USER_NS
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ if (current_user_ns() == task_cred_xxx(t, user_ns))
+ return;
+
+ if (SI_FROMKERNEL(info))
+ return;
+
+ info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+ current_cred(), info->si_uid);
+}
+#else
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ return;
+}
+#endif
+
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_pid = 0;
break;
}
+
+ userns_fixup_signal_uid(&q->info, t);
+
} else if (!is_si_special(info)) {
if (sig >= SIGRTMIN && info->si_code != SI_USER) {
/*
@@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
- info.si_uid = __task_cred(tsk)->uid;
+ info.si_uid = map_cred_ns(__task_cred(tsk),
+ task_cred_xxx(tsk->parent, user_ns));
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
@@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
- info.si_uid = __task_cred(tsk)->uid;
+ info.si_uid = map_cred_ns(__task_cred(tsk),
+ task_cred_xxx(parent, user_ns));
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
+ rcu_read_lock();
info->si_pid = task_pid_vnr(current->parent);
- info->si_uid = task_uid(current->parent);
+ info->si_uid = map_cred_ns(__task_cred(current->parent),
+ current_user_ns());
+ rcu_read_unlock();
}
/* If the (new) signal is now blocked, requeue it. */
@@ -2318,6 +2355,27 @@ relock:
return signr;
}
+/**
+ * block_sigmask - add @ka's signal mask to current->blocked
+ * @ka: action for @signr
+ * @signr: signal that has been successfully delivered
+ *
+ * This function should be called when a signal has succesfully been
+ * delivered. It adds the mask of signals for @ka to current->blocked
+ * so that they are blocked during the execution of the signal
+ * handler. In addition, @signr will be blocked unless %SA_NODEFER is
+ * set in @ka->sa.sa_flags.
+ */
+void block_sigmask(struct k_sigaction *ka, int signr)
+{
+ sigset_t blocked;
+
+ sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, signr);
+ set_current_blocked(&blocked);
+}
+
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
@@ -2355,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
int group_stop = 0;
sigset_t unblocked;
+ /*
+ * @tsk is about to have PF_EXITING set - lock out users which
+ * expect stable threadgroup.
+ */
+ threadgroup_change_begin(tsk);
+
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
+ threadgroup_change_end(tsk);
return;
}
@@ -2366,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
* see wants_signal(), do_signal_stop().
*/
tsk->flags |= PF_EXITING;
+
+ threadgroup_change_end(tsk);
+
if (!signal_pending(tsk))
goto out;
diff --git a/kernel/sys.c b/kernel/sys.c
index ddf8155bf3f8..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ unsigned long rlim = rlimit(RLIMIT_DATA);
+ unsigned long vm_req_flags;
+ unsigned long vm_bad_flags;
+ struct vm_area_struct *vma;
+ int error = 0;
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 | arg5)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (addr >= TASK_SIZE)
+ return -EINVAL;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+
+ if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
+ /* It must be existing VMA */
+ if (!vma || vma->vm_start > addr)
+ goto out;
+ }
+
+ error = -EINVAL;
+ switch (opt) {
+ case PR_SET_MM_START_CODE:
+ case PR_SET_MM_END_CODE:
+ vm_req_flags = VM_READ | VM_EXEC;
+ vm_bad_flags = VM_WRITE | VM_MAYSHARE;
+
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+ (vma->vm_flags & vm_bad_flags))
+ goto out;
+
+ if (opt == PR_SET_MM_START_CODE)
+ mm->start_code = addr;
+ else
+ mm->end_code = addr;
+ break;
+
+ case PR_SET_MM_START_DATA:
+ case PR_SET_MM_END_DATA:
+ vm_req_flags = VM_READ | VM_WRITE;
+ vm_bad_flags = VM_EXEC | VM_MAYSHARE;
+
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+ (vma->vm_flags & vm_bad_flags))
+ goto out;
+
+ if (opt == PR_SET_MM_START_DATA)
+ mm->start_data = addr;
+ else
+ mm->end_data = addr;
+ break;
+
+ case PR_SET_MM_START_STACK:
+
+#ifdef CONFIG_STACK_GROWSUP
+ vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
+#else
+ vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
+#endif
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
+ goto out;
+
+ mm->start_stack = addr;
+ break;
+
+ case PR_SET_MM_START_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (rlim < RLIM_INFINITY &&
+ (mm->brk - addr) +
+ (mm->end_data - mm->start_data) > rlim)
+ goto out;
+
+ mm->start_brk = addr;
+ break;
+
+ case PR_SET_MM_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (rlim < RLIM_INFINITY &&
+ (addr - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
+ goto out;
+
+ mm->brk = addr;
+ break;
+
+ default:
+ error = -EINVAL;
+ goto out;
+ }
+
+ error = 0;
+
+out:
+ up_read(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
error = PR_MCE_KILL_DEFAULT;
break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
{
.procname = "bootloader_type",
.data = &bootloader_type,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1e8943fed1d..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
+#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/sort.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
@@ -947,13 +949,6 @@ struct ftrace_func_probe {
struct rcu_head rcu;
};
-enum {
- FTRACE_ENABLE_CALLS = (1 << 0),
- FTRACE_DISABLE_CALLS = (1 << 1),
- FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
- FTRACE_START_FUNC_RET = (1 << 3),
- FTRACE_STOP_FUNC_RET = (1 << 4),
-};
struct ftrace_func_entry {
struct hlist_node hlist;
unsigned long ip;
@@ -984,18 +979,19 @@ static struct ftrace_ops global_ops = {
.filter_hash = EMPTY_HASH,
};
-static struct dyn_ftrace *ftrace_new_addrs;
-
static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
struct ftrace_page *next;
+ struct dyn_ftrace *records;
int index;
- struct dyn_ftrace records[];
+ int size;
};
-#define ENTRIES_PER_PAGE \
- ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+static struct ftrace_page *ftrace_new_pgs;
+
+#define ENTRY_SIZE sizeof(struct dyn_ftrace)
+#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
/* estimate from running different kernels */
#define NR_TO_INIT 10000
@@ -1003,7 +999,10 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static struct dyn_ftrace *ftrace_free_records;
+static bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
static struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1013,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
struct hlist_head *hhd;
struct hlist_node *n;
- if (!hash->count)
+ if (ftrace_hash_empty(hash))
return NULL;
if (hash->size_bits > 0)
@@ -1157,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
/* Empty hash? */
- if (!hash || !hash->count)
+ if (ftrace_hash_empty(hash))
return new_hash;
size = 1 << hash->size_bits;
@@ -1282,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
filter_hash = rcu_dereference_raw(ops->filter_hash);
notrace_hash = rcu_dereference_raw(ops->notrace_hash);
- if ((!filter_hash || !filter_hash->count ||
+ if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
- (!notrace_hash || !notrace_hash->count ||
+ (ftrace_hash_empty(notrace_hash) ||
!ftrace_lookup_ip(notrace_hash, ip)))
ret = 1;
else
@@ -1307,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
} \
}
+
+static int ftrace_cmp_recs(const void *a, const void *b)
+{
+ const struct dyn_ftrace *reca = a;
+ const struct dyn_ftrace *recb = b;
+
+ if (reca->ip > recb->ip)
+ return 1;
+ if (reca->ip < recb->ip)
+ return -1;
+ return 0;
+}
+
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns 1 if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_location(unsigned long ip)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ struct dyn_ftrace key;
+
+ key.ip = ip;
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ if (rec)
+ return 1;
+ }
+
+ return 0;
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1336,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (filter_hash) {
hash = ops->filter_hash;
other_hash = ops->notrace_hash;
- if (!hash || !hash->count)
+ if (ftrace_hash_empty(hash))
all = 1;
} else {
inc = !inc;
@@ -1346,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* If the notrace hash has no items,
* then there's nothing to do.
*/
- if (hash && !hash->count)
+ if (ftrace_hash_empty(hash))
return;
}
@@ -1363,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
match = 1;
} else {
- in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_hash = !!ftrace_lookup_ip(hash, rec->ip);
+ in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
/*
*
@@ -1372,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (filter_hash && in_hash && !in_other_hash)
match = 1;
else if (!filter_hash && in_hash &&
- (in_other_hash || !other_hash->count))
+ (in_other_hash || ftrace_hash_empty(other_hash)))
match = 1;
}
if (!match)
@@ -1406,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
-static void ftrace_free_rec(struct dyn_ftrace *rec)
-{
- rec->freelist = ftrace_free_records;
- ftrace_free_records = rec;
- rec->flags |= FTRACE_FL_FREE;
-}
-
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
- struct dyn_ftrace *rec;
-
- /* First check for freed records */
- if (ftrace_free_records) {
- rec = ftrace_free_records;
-
- if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
- FTRACE_WARN_ON_ONCE(1);
- ftrace_free_records = NULL;
+ if (ftrace_pages->index == ftrace_pages->size) {
+ /* We should have allocated enough */
+ if (WARN_ON(!ftrace_pages->next))
return NULL;
- }
-
- ftrace_free_records = rec->freelist;
- memset(rec, 0, sizeof(*rec));
- return rec;
- }
-
- if (ftrace_pages->index == ENTRIES_PER_PAGE) {
- if (!ftrace_pages->next) {
- /* allocate another page */
- ftrace_pages->next =
- (void *)get_zeroed_page(GFP_KERNEL);
- if (!ftrace_pages->next)
- return NULL;
- }
ftrace_pages = ftrace_pages->next;
}
@@ -1459,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
return NULL;
rec->ip = ip;
- rec->newlist = ftrace_new_addrs;
- ftrace_new_addrs = rec;
return rec;
}
@@ -1475,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
-static void ftrace_bug(int failed, unsigned long ip)
+/**
+ * ftrace_bug - report and shutdown function tracer
+ * @failed: The failed type (EFAULT, EINVAL, EPERM)
+ * @ip: The address that failed
+ *
+ * The arch code that enables or disables the function tracing
+ * can call ftrace_bug() when it has detected a problem in
+ * modifying the code. @failed should be one of either:
+ * EFAULT - if the problem happens on reading the @ip address
+ * EINVAL - if what is read at @ip is not what was expected
+ * EPERM - if the problem happens on writting to the @ip address
+ */
+void ftrace_bug(int failed, unsigned long ip)
{
switch (failed) {
case -EFAULT:
@@ -1517,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
return 0;
}
-
-static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
- unsigned long ftrace_addr;
unsigned long flag = 0UL;
- ftrace_addr = (unsigned long)FTRACE_ADDR;
-
/*
- * If we are enabling tracing:
+ * If we are updating calls:
*
* If the record has a ref count, then we need to enable it
* because someone is using it.
*
* Otherwise we make sure its disabled.
*
- * If we are disabling tracing, then disable all records that
+ * If we are disabling calls, then disable all records that
* are enabled.
*/
if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1542,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
- return 0;
+ return FTRACE_UPDATE_IGNORE;
if (flag) {
- rec->flags |= FTRACE_FL_ENABLED;
+ if (update)
+ rec->flags |= FTRACE_FL_ENABLED;
+ return FTRACE_UPDATE_MAKE_CALL;
+ }
+
+ if (update)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+
+ return FTRACE_UPDATE_MAKE_NOP;
+}
+
+/**
+ * ftrace_update_record, set a record that now is tracing or not
+ * @rec: the record to update
+ * @enable: set to 1 if the record is tracing, zero to force disable
+ *
+ * The records that represent all functions that can be traced need
+ * to be updated when tracing has been enabled.
+ */
+int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 1);
+}
+
+/**
+ * ftrace_test_record, check if the record has been enabled or not
+ * @rec: the record to test
+ * @enable: set to 1 to check if enabled, 0 if it is disabled
+ *
+ * The arch code may need to test if a record is already set to
+ * tracing to determine how to modify the function code that it
+ * represents.
+ */
+int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 0);
+}
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ ret = ftrace_update_record(rec, enable);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
return ftrace_make_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
- rec->flags &= ~FTRACE_FL_ENABLED;
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ return -1; /* unknow ftrace bug */
}
-static void ftrace_replace_code(int enable)
+static void ftrace_replace_code(int update)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -1563,11 +1634,7 @@ static void ftrace_replace_code(int enable)
return;
do_for_each_ftrace_rec(pg, rec) {
- /* Skip over free records */
- if (rec->flags & FTRACE_FL_FREE)
- continue;
-
- failed = __ftrace_replace_code(rec, enable);
+ failed = __ftrace_replace_code(rec, update);
if (failed) {
ftrace_bug(failed, rec->ip);
/* Stop processing */
@@ -1576,6 +1643,78 @@ static void ftrace_replace_code(int enable)
} while_for_each_ftrace_rec();
}
+struct ftrace_rec_iter {
+ struct ftrace_page *pg;
+ int index;
+};
+
+/**
+ * ftrace_rec_iter_start, start up iterating over traced functions
+ *
+ * Returns an iterator handle that is used to iterate over all
+ * the records that represent address locations where functions
+ * are traced.
+ *
+ * May return NULL if no records are available.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_start(void)
+{
+ /*
+ * We only use a single iterator.
+ * Protected by the ftrace_lock mutex.
+ */
+ static struct ftrace_rec_iter ftrace_rec_iter;
+ struct ftrace_rec_iter *iter = &ftrace_rec_iter;
+
+ iter->pg = ftrace_pages_start;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_next, get the next record to process.
+ * @iter: The handle to the iterator.
+ *
+ * Returns the next iterator after the given iterator @iter.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
+{
+ iter->index++;
+
+ if (iter->index >= iter->pg->index) {
+ iter->pg = iter->pg->next;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+ }
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_record, get the record at the iterator location
+ * @iter: The current iterator location
+ *
+ * Returns the record that the current @iter is at.
+ */
+struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
+{
+ return &iter->pg->records[iter->index];
+}
+
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
@@ -1617,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
{
int *command = data;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine, no worrying about races.
- */
- function_trace_stop++;
-
- if (*command & FTRACE_ENABLE_CALLS)
+ if (*command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
else if (*command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
@@ -1636,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
else if (*command & FTRACE_STOP_FUNC_RET)
ftrace_disable_ftrace_graph_caller();
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- /*
- * For archs that call ftrace_test_stop_func(), we must
- * wait till after we update all the function callers
- * before we update the callback. This keeps different
- * ops that record different functions from corrupting
- * each other.
- */
- __ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
- function_trace_stop--;
-
return 0;
}
+/**
+ * ftrace_run_stop_machine, go back to the stop machine method
+ * @command: The command to tell ftrace what to do
+ *
+ * If an arch needs to fall back to the stop machine method, the
+ * it can call this function.
+ */
+void ftrace_run_stop_machine(int command)
+{
+ stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
+/**
+ * arch_ftrace_update_code, modify the code to trace or not trace
+ * @command: The command that needs to be done
+ *
+ * Archs can override this function if it does not need to
+ * run stop_machine() to modify code.
+ */
+void __weak arch_ftrace_update_code(int command)
+{
+ ftrace_run_stop_machine(command);
+}
+
static void ftrace_run_update_code(int command)
{
int ret;
@@ -1659,8 +1804,31 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
+ /*
+ * Do not call function tracer while we update the code.
+ * We are in stop machine.
+ */
+ function_trace_stop++;
- stop_machine(__ftrace_modify_code, &command, NULL);
+ /*
+ * By default we use stop_machine() to modify the code.
+ * But archs can do what ever they want as long as it
+ * is safe. The stop_machine() is the safest, but also
+ * produces the most overhead.
+ */
+ arch_ftrace_update_code(command);
+
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ /*
+ * For archs that call ftrace_test_stop_func(), we must
+ * wait till after we update all the function callers
+ * before we update the callback. This keeps different
+ * ops that record different functions from corrupting
+ * each other.
+ */
+ __ftrace_trace_function = __ftrace_trace_function_delay;
+#endif
+ function_trace_stop--;
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
@@ -1691,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return -ENODEV;
ftrace_start_up++;
- command |= FTRACE_ENABLE_CALLS;
+ command |= FTRACE_UPDATE_CALLS;
/* ops marked global share the filter hashes */
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1743,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops != &global_ops || !global_start_up)
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
- if (!ftrace_start_up)
- command |= FTRACE_DISABLE_CALLS;
+ command |= FTRACE_UPDATE_CALLS;
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
@@ -1766,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
}
static void ftrace_shutdown_sysctl(void)
@@ -1788,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
struct ftrace_hash *hash;
hash = ops->filter_hash;
- return !!(!hash || !hash->count);
+ return ftrace_hash_empty(hash);
}
static int ftrace_update_code(struct module *mod)
{
+ struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long ref = 0;
+ int i;
/*
* When adding a module, we need to check if tracers are
@@ -1817,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
start = ftrace_now(raw_smp_processor_id());
ftrace_update_cnt = 0;
- while (ftrace_new_addrs) {
+ for (pg = ftrace_new_pgs; pg; pg = pg->next) {
- /* If something went wrong, bail without enabling anything */
- if (unlikely(ftrace_disabled))
- return -1;
+ for (i = 0; i < pg->index; i++) {
+ /* If something went wrong, bail without enabling anything */
+ if (unlikely(ftrace_disabled))
+ return -1;
- p = ftrace_new_addrs;
- ftrace_new_addrs = p->newlist;
- p->flags = ref;
+ p = &pg->records[i];
+ p->flags = ref;
- /*
- * Do the initial record conversion from mcount jump
- * to the NOP instructions.
- */
- if (!ftrace_code_disable(mod, p)) {
- ftrace_free_rec(p);
- /* Game over */
- break;
- }
+ /*
+ * Do the initial record conversion from mcount jump
+ * to the NOP instructions.
+ */
+ if (!ftrace_code_disable(mod, p))
+ break;
- ftrace_update_cnt++;
+ ftrace_update_cnt++;
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && ref) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed) {
- ftrace_bug(failed, p->ip);
- ftrace_free_rec(p);
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ */
+ if (ftrace_start_up && ref) {
+ int failed = __ftrace_replace_code(p, 1);
+ if (failed)
+ ftrace_bug(failed, p->ip);
}
}
}
+ ftrace_new_pgs = NULL;
+
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1864,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
return 0;
}
-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+static int ftrace_allocate_records(struct ftrace_page *pg, int count)
{
- struct ftrace_page *pg;
+ int order;
int cnt;
- int i;
- /* allocate a few pages */
- ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
- if (!ftrace_pages_start)
- return -1;
+ if (WARN_ON(!count))
+ return -EINVAL;
+
+ order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
/*
- * Allocate a few more pages.
- *
- * TODO: have some parser search vmlinux before
- * final linking to find all calls to ftrace.
- * Then we can:
- * a) know how many pages to allocate.
- * and/or
- * b) set up the table then.
- *
- * The dynamic code is still necessary for
- * modules.
+ * We want to fill as much as possible. No more than a page
+ * may be empty.
*/
+ while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
+ order--;
- pg = ftrace_pages = ftrace_pages_start;
+ again:
+ pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
- cnt = num_to_init / ENTRIES_PER_PAGE;
- pr_info("ftrace: allocating %ld entries in %d pages\n",
- num_to_init, cnt + 1);
+ if (!pg->records) {
+ /* if we can't allocate this size, try something smaller */
+ if (!order)
+ return -ENOMEM;
+ order >>= 1;
+ goto again;
+ }
- for (i = 0; i < cnt; i++) {
- pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+ cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+ pg->size = cnt;
- /* If we fail, we'll try later anyway */
- if (!pg->next)
+ if (cnt > count)
+ cnt = count;
+
+ return cnt;
+}
+
+static struct ftrace_page *
+ftrace_allocate_pages(unsigned long num_to_init)
+{
+ struct ftrace_page *start_pg;
+ struct ftrace_page *pg;
+ int order;
+ int cnt;
+
+ if (!num_to_init)
+ return 0;
+
+ start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg)
+ return NULL;
+
+ /*
+ * Try to allocate as much as possible in one continues
+ * location that fills in all of the space. We want to
+ * waste as little space as possible.
+ */
+ for (;;) {
+ cnt = ftrace_allocate_records(pg, num_to_init);
+ if (cnt < 0)
+ goto free_pages;
+
+ num_to_init -= cnt;
+ if (!num_to_init)
break;
+ pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg->next)
+ goto free_pages;
+
pg = pg->next;
}
- return 0;
+ return start_pg;
+
+ free_pages:
+ while (start_pg) {
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ start_pg = pg->next;
+ kfree(pg);
+ pg = start_pg;
+ }
+ pr_info("ftrace: FAILED to allocate memory for functions\n");
+ return NULL;
}
-enum {
- FTRACE_ITER_FILTER = (1 << 0),
- FTRACE_ITER_NOTRACE = (1 << 1),
- FTRACE_ITER_PRINTALL = (1 << 2),
- FTRACE_ITER_HASH = (1 << 3),
- FTRACE_ITER_ENABLED = (1 << 4),
-};
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+{
+ int cnt;
+
+ if (!num_to_init) {
+ pr_info("ftrace: No functions to be traced?\n");
+ return -1;
+ }
+
+ cnt = num_to_init / ENTRIES_PER_PAGE;
+ pr_info("ftrace: allocating %ld entries in %d pages\n",
+ num_to_init, cnt + 1);
+
+ return 0;
+}
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1980,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
void *p = NULL;
loff_t l;
+ if (!(iter->flags & FTRACE_ITER_DO_HASH))
+ return NULL;
+
if (iter->func_pos > *pos)
return NULL;
@@ -2023,7 +2244,7 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = &global_ops;
+ struct ftrace_ops *ops = iter->ops;
struct dyn_ftrace *rec = NULL;
if (unlikely(ftrace_disabled))
@@ -2047,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
} else {
rec = &iter->pg->records[iter->idx++];
- if ((rec->flags & FTRACE_FL_FREE) ||
-
- ((iter->flags & FTRACE_ITER_FILTER) &&
+ if (((iter->flags & FTRACE_ITER_FILTER) &&
!(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2081,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = &global_ops;
+ struct ftrace_ops *ops = iter->ops;
void *p = NULL;
loff_t l;
@@ -2101,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
+ if (iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->filter_hash)) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2126,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
break;
}
- if (!p) {
- if (iter->flags & FTRACE_ITER_FILTER)
- return t_hash_start(m, pos);
-
- return NULL;
- }
+ if (!p)
+ return t_hash_start(m, pos);
return iter;
}
@@ -2189,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return -ENOMEM;
iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -2217,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
iter->pg = ftrace_pages_start;
iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -2237,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
mutex_unlock(&ftrace_lock);
}
-static int
+/**
+ * ftrace_regex_open - initialize function tracer filter files
+ * @ops: The ftrace_ops that hold the hash filters
+ * @flag: The type of filter to process
+ * @inode: The inode, usually passed in to your open routine
+ * @file: The file, usually passed in to your open routine
+ *
+ * ftrace_regex_open() initializes the filter files for the
+ * @ops. Depending on @flag it may process the filter hash or
+ * the notrace hash of @ops. With this called from the open
+ * routine, you can use ftrace_filter_write() for the write
+ * routine if @flag has FTRACE_ITER_FILTER set, or
+ * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
+ * ftrace_regex_lseek() should be used as the lseek routine, and
+ * release must call ftrace_regex_release().
+ */
+int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct inode *inode, struct file *file)
{
@@ -2306,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
- inode, file);
+ return ftrace_regex_open(&global_ops,
+ FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
+ inode, file);
}
static int
@@ -2317,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-static loff_t
+loff_t
ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
{
loff_t ret;
@@ -2426,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
-
if (ftrace_match_record(rec, mod, search, search_len, type)) {
ret = enter_record(hash, rec, not);
if (ret < 0) {
@@ -2871,14 +3105,14 @@ out_unlock:
return ret;
}
-static ssize_t
+ssize_t
ftrace_filter_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
}
-static ssize_t
+ssize_t
ftrace_notrace_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -2919,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
&& ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
mutex_unlock(&ftrace_lock);
@@ -3045,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-static void __init
-set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
+void __init
+ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;
@@ -3059,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
static void __init set_ftrace_early_filters(void)
{
if (ftrace_filter_buf[0])
- set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
+ ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
if (ftrace_notrace_buf[0])
- set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
+ ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
set_ftrace_early_graph(ftrace_graph_buf);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
-static int
-ftrace_regex_release(struct inode *inode, struct file *file)
+int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
@@ -3107,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash, iter->hash);
if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
&& ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
mutex_unlock(&ftrace_lock);
}
@@ -3270,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_FREE)
- continue;
-
if (ftrace_match_record(rec, NULL, search, search_len, type)) {
/* if it is in the array */
exists = false;
@@ -3381,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
return 0;
}
+static void ftrace_swap_recs(void *a, void *b, int size)
+{
+ struct dyn_ftrace *reca = a;
+ struct dyn_ftrace *recb = b;
+ struct dyn_ftrace t;
+
+ t = *reca;
+ *reca = *recb;
+ *recb = t;
+}
+
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg;
+ unsigned long count;
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
+ int ret = -ENOMEM;
+
+ count = end - start;
+
+ if (!count)
+ return 0;
+
+ pg = ftrace_allocate_pages(count);
+ if (!pg)
+ return -ENOMEM;
mutex_lock(&ftrace_lock);
+
+ /*
+ * Core and each module needs their own pages, as
+ * modules will free them when they are removed.
+ * Force a new page to be allocated for modules.
+ */
+ if (!mod) {
+ WARN_ON(ftrace_pages || ftrace_pages_start);
+ /* First initialization */
+ ftrace_pages = ftrace_pages_start = pg;
+ } else {
+ if (!ftrace_pages)
+ goto out;
+
+ if (WARN_ON(ftrace_pages->next)) {
+ /* Hmm, we have free pages? */
+ while (ftrace_pages->next)
+ ftrace_pages = ftrace_pages->next;
+ }
+
+ ftrace_pages->next = pg;
+ ftrace_pages = pg;
+ }
+
p = start;
while (p < end) {
addr = ftrace_call_adjust(*p++);
@@ -3401,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!addr)
continue;
- ftrace_record_ip(addr);
+ if (!ftrace_record_ip(addr))
+ break;
}
+ /* These new locations need to be initialized */
+ ftrace_new_pgs = pg;
+
+ /* Make each individual set of pages sorted by ips */
+ for (; pg; pg = pg->next)
+ sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs, ftrace_swap_recs);
+
/*
* We only need to disable interrupts on start up
* because we are modifying code that an interrupt
@@ -3417,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
ftrace_update_code(mod);
if (!mod)
local_irq_restore(flags);
+ ret = 0;
+ out:
mutex_unlock(&ftrace_lock);
- return 0;
+ return ret;
}
#ifdef CONFIG_MODULES
+
+#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
+ struct ftrace_page **last_pg;
struct ftrace_page *pg;
+ int order;
mutex_lock(&ftrace_lock);
if (ftrace_disabled)
goto out_unlock;
- do_for_each_ftrace_rec(pg, rec) {
+ /*
+ * Each module has its own ftrace_pages, remove
+ * them from the list.
+ */
+ last_pg = &ftrace_pages_start;
+ for (pg = ftrace_pages_start; pg; pg = *last_pg) {
+ rec = &pg->records[0];
if (within_module_core(rec->ip, mod)) {
/*
- * rec->ip is changed in ftrace_free_rec()
- * It should not between s and e if record was freed.
+ * As core pages are first, the first
+ * page should never be a module page.
*/
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
- ftrace_free_rec(rec);
- }
- } while_for_each_ftrace_rec();
+ if (WARN_ON(pg == ftrace_pages_start))
+ goto out_unlock;
+
+ /* Check if we are deleting the last page */
+ if (pg == ftrace_pages)
+ ftrace_pages = next_to_ftrace_page(last_pg);
+
+ *last_pg = pg->next;
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ kfree(pg);
+ } else
+ last_pg = &pg->next;
+ }
out_unlock:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f04cc3136bd3..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1738,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
return -ENOMEM;
}
+static int create_filter_start(char *filter_str, bool set_str,
+ struct filter_parse_state **psp,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter;
+ struct filter_parse_state *ps = NULL;
+ int err = 0;
+
+ WARN_ON_ONCE(*psp || *filterp);
+
+ /* allocate everything, and if any fails, free all and fail */
+ filter = __alloc_filter();
+ if (filter && set_str)
+ err = replace_filter_string(filter, filter_str);
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+
+ if (!filter || !ps || err) {
+ kfree(ps);
+ __free_filter(filter);
+ return -ENOMEM;
+ }
+
+ /* we're committed to creating a new filter */
+ *filterp = filter;
+ *psp = ps;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ return err;
+}
+
+static void create_filter_finish(struct filter_parse_state *ps)
+{
+ if (ps) {
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+ }
+}
+
+/**
+ * create_filter - create a filter for a ftrace_event_call
+ * @call: ftrace_event_call to create a filter for
+ * @filter_str: filter string
+ * @set_str: remember @filter_str and enable detailed error in filter
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Creates a filter for @call with @filter_str. If @set_str is %true,
+ * @filter_str is copied and recorded in the new filter.
+ *
+ * On success, returns 0 and *@filterp points to the new filter. On
+ * failure, returns -errno and *@filterp may point to %NULL or to a new
+ * filter. In the latter case, the returned filter contains error
+ * information if @set_str is %true and the caller is responsible for
+ * freeing it.
+ */
+static int create_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, set_str, &ps, &filter);
+ if (!err) {
+ err = replace_preds(call, filter, ps, filter_str, false);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
+/**
+ * create_system_filter - create a filter for an event_subsystem
+ * @system: event_subsystem to create a filter for
+ * @filter_str: filter string
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Identical to create_filter() except that it creates a subsystem filter
+ * and always remembers @filter_str.
+ */
+static int create_system_filter(struct event_subsystem *system,
+ char *filter_str, struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, true, &ps, &filter);
+ if (!err) {
+ err = replace_system_preds(system, ps, filter_str);
+ if (!err) {
+ /* System filters just show a default message */
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ } else {
+ append_filter_err(ps, filter);
+ }
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
- struct filter_parse_state *ps;
struct event_filter *filter;
- struct event_filter *tmp;
int err = 0;
mutex_lock(&event_mutex);
@@ -1759,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
goto out_unlock;
}
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto out_unlock;
-
- filter = __alloc_filter();
- if (!filter) {
- kfree(ps);
- goto out_unlock;
- }
-
- replace_filter_string(filter, filter_string);
-
- parse_init(ps, filter_ops, filter_string);
- err = filter_parse(ps);
- if (err) {
- append_filter_err(ps, filter);
- goto out;
- }
+ err = create_filter(call, filter_string, true, &filter);
- err = replace_preds(call, filter, ps, filter_string, false);
- if (err) {
- filter_disable(call);
- append_filter_err(ps, filter);
- } else
- call->flags |= TRACE_EVENT_FL_FILTERED;
-out:
/*
* Always swap the call filter with the new filter
* even if there was an error. If there was an error
* in the filter, we disable the filter and show the error
* string
*/
- tmp = call->filter;
- rcu_assign_pointer(call->filter, filter);
- if (tmp) {
- /* Make sure the call is done with the filter */
- synchronize_sched();
- __free_filter(tmp);
+ if (filter) {
+ struct event_filter *tmp = call->filter;
+
+ if (!err)
+ call->flags |= TRACE_EVENT_FL_FILTERED;
+ else
+ filter_disable(call);
+
+ rcu_assign_pointer(call->filter, filter);
+
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ __free_filter(tmp);
+ }
}
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
out_unlock:
mutex_unlock(&event_mutex);
@@ -1811,7 +1902,6 @@ out_unlock:
int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
- struct filter_parse_state *ps;
struct event_filter *filter;
int err = 0;
@@ -1835,48 +1925,19 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out_unlock;
}
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto out_unlock;
-
- filter = __alloc_filter();
- if (!filter)
- goto out;
-
- /* System filters just show a default message */
- kfree(filter->filter_string);
- filter->filter_string = NULL;
-
- /*
- * No event actually uses the system filter
- * we can free it without synchronize_sched().
- */
- __free_filter(system->filter);
- system->filter = filter;
-
- parse_init(ps, filter_ops, filter_string);
- err = filter_parse(ps);
- if (err)
- goto err_filter;
-
- err = replace_system_preds(system, ps, filter_string);
- if (err)
- goto err_filter;
-
-out:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
+ err = create_system_filter(system, filter_string, &filter);
+ if (filter) {
+ /*
+ * No event actually uses the system filter
+ * we can free it without synchronize_sched().
+ */
+ __free_filter(system->filter);
+ system->filter = filter;
+ }
out_unlock:
mutex_unlock(&event_mutex);
return err;
-
-err_filter:
- replace_filter_string(filter, filter_string);
- append_filter_err(ps, system->filter);
- goto out;
}
#ifdef CONFIG_PERF_EVENTS
@@ -1894,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
{
int err;
struct event_filter *filter;
- struct filter_parse_state *ps;
struct ftrace_event_call *call;
mutex_lock(&event_mutex);
@@ -1909,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (event->filter)
goto out_unlock;
- filter = __alloc_filter();
- if (!filter) {
- err = PTR_ERR(filter);
- goto out_unlock;
- }
-
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto free_filter;
-
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err)
- goto free_ps;
-
- err = replace_preds(call, filter, ps, filter_str, false);
+ err = create_filter(call, filter_str, false, &filter);
if (!err)
event->filter = filter;
-
-free_ps:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
-
-free_filter:
- if (err)
+ else
__free_filter(filter);
out_unlock:
@@ -1954,43 +1991,6 @@ out_unlock:
#define CREATE_TRACE_POINTS
#include "trace_events_filter_test.h"
-static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
- struct event_filter **pfilter)
-{
- struct event_filter *filter;
- struct filter_parse_state *ps;
- int err = -ENOMEM;
-
- filter = __alloc_filter();
- if (!filter)
- goto out;
-
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto free_filter;
-
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err)
- goto free_ps;
-
- err = replace_preds(call, filter, ps, filter_str, false);
- if (!err)
- *pfilter = filter;
-
- free_ps:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
-
- free_filter:
- if (err)
- __free_filter(filter);
-
- out:
- return err;
-}
-
#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
{ \
.filter = FILTER, \
@@ -2109,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
struct test_filter_data_t *d = &test_filter_data[i];
int err;
- err = test_get_filter(d->filter, &event_ftrace_test_filter,
- &filter);
+ err = create_filter(&event_ftrace_test_filter, d->filter,
+ false, &filter);
if (err) {
printk(KERN_INFO
"Failed to get filter for '%s', err %d\n",
d->filter, err);
+ __free_filter(filter);
break;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/fs.h>
+
+#include <asm/setup.h>
+
#include "trace.h"
#define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
};
static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
.release = seq_release,
};
+static int
+stack_trace_filter_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+ inode, file);
+}
+
+static const struct file_operations stack_trace_filter_fops = {
+ .open = stack_trace_filter_open,
+ .read = seq_read,
+ .write = ftrace_filter_write,
+ .llseek = ftrace_regex_lseek,
+ .release = ftrace_regex_release,
+};
+
int
stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
return ret;
}
+static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
+
static __init int enable_stacktrace(char *str)
{
+ if (strncmp(str, "_filter=", 8) == 0)
+ strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+
stack_tracer_enabled = 1;
last_stack_tracer_enabled = 1;
return 1;
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
+ trace_create_file("stack_trace_filter", 0444, d_tracer,
+ NULL, &stack_trace_filter_fops);
+
+ if (stack_trace_filter_buf[0])
+ ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
+
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index db110b8ae030..f1539decd99d 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod)
int ret = 0;
/*
- * We skip modules that tain the kernel, especially those with different
- * module header (for forced load), to make sure we don't cause a crash.
+ * We skip modules that taint the kernel, especially those with different
+ * module headers (for forced load), to make sure we don't cause a crash.
+ * Staging and out-of-tree GPL modules are fine.
*/
- if (mod->taints)
+ if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
return 0;
mutex_lock(&tracepoints_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1d7bca7f4f52..d117262deba3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
int nr_drainers; /* W: drain in progress */
int saved_max_active; /* W: saved cwq max_active */
- const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
+ char name[]; /* I: workqueue name */
};
struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
return clamp_val(max_active, 1, lim);
}
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
struct lock_class_key *key,
- const char *lock_name)
+ const char *lock_name, ...)
{
+ va_list args, args1;
struct workqueue_struct *wq;
unsigned int cpu;
+ size_t namelen;
+
+ /* determine namelen, allocate wq and format name */
+ va_start(args, lock_name);
+ va_copy(args1, args);
+ namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+ wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+ if (!wq)
+ goto err;
+
+ vsnprintf(wq->name, namelen, fmt, args1);
+ va_end(args);
+ va_end(args1);
/*
* Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
flags |= WQ_HIGHPRI;
max_active = max_active ?: WQ_DFL_ACTIVE;
- max_active = wq_clamp_max_active(max_active, flags, name);
-
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- goto err;
+ max_active = wq_clamp_max_active(max_active, flags, wq->name);
+ /* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
- wq->name = name;
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
if (!rescuer)
goto err;
- rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+ rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+ wq->name);
if (IS_ERR(rescuer->task))
goto err;