diff options
Diffstat (limited to 'kernel')
51 files changed, 2377 insertions, 1429 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e1c5bf3365c0..2921d90ce32f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + async.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/async.c b/kernel/async.c new file mode 100644 index 000000000000..f286e9f2b736 --- /dev/null +++ b/kernel/async.c @@ -0,0 +1,335 @@ +/* + * async.c: Asynchronous function calls for boot performance + * + * (C) Copyright 2009 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + + +/* + +Goals and Theory of Operation + +The primary goal of this feature is to reduce the kernel boot time, +by doing various independent hardware delays and discovery operations +decoupled and not strictly serialized. + +More specifically, the asynchronous function call concept allows +certain operations (primarily during system boot) to happen +asynchronously, out of order, while these operations still +have their externally visible parts happen sequentially and in-order. +(not unlike how out-of-order CPUs retire their instructions in order) + +Key to the asynchronous function call implementation is the concept of +a "sequence cookie" (which, although it has an abstracted type, can be +thought of as a monotonically incrementing number). + +The async core will assign each scheduled event such a sequence cookie and +pass this to the called functions. + +The asynchronously called function should before doing a globally visible +operation, such as registering device numbers, call the +async_synchronize_cookie() function and pass in its own cookie. The +async_synchronize_cookie() function will make sure that all asynchronous +operations that were scheduled prior to the operation corresponding with the +cookie have completed. + +Subsystem/driver initialization code that scheduled asynchronous probe +functions, but which shares global resources with other drivers/subsystems +that do not use the asynchronous call feature, need to do a full +synchronization with the async_synchronize_full() function, before returning +from their init function. This is to maintain strict ordering between the +asynchronous and synchronous parts of the kernel. + +*/ + +#include <linux/async.h> +#include <linux/module.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/kthread.h> +#include <asm/atomic.h> + +static async_cookie_t next_cookie = 1; + +#define MAX_THREADS 256 +#define MAX_WORK 32768 + +static LIST_HEAD(async_pending); +static LIST_HEAD(async_running); +static DEFINE_SPINLOCK(async_lock); + +static int async_enabled = 0; + +struct async_entry { + struct list_head list; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; +}; + +static DECLARE_WAIT_QUEUE_HEAD(async_done); +static DECLARE_WAIT_QUEUE_HEAD(async_new); + +static atomic_t entry_count; +static atomic_t thread_count; + +extern int initcall_debug; + + +/* + * MUST be called with the lock held! + */ +static async_cookie_t __lowest_in_progress(struct list_head *running) +{ + struct async_entry *entry; + if (!list_empty(&async_pending)) { + entry = list_first_entry(&async_pending, + struct async_entry, list); + return entry->cookie; + } else if (!list_empty(running)) { + entry = list_first_entry(running, + struct async_entry, list); + return entry->cookie; + } else { + /* nothing in progress... next_cookie is "infinity" */ + return next_cookie; + } + +} +/* + * pick the first pending entry and run it + */ +static void run_one_entry(void) +{ + unsigned long flags; + struct async_entry *entry; + ktime_t calltime, delta, rettime; + + /* 1) pick one task from the pending queue */ + + spin_lock_irqsave(&async_lock, flags); + if (list_empty(&async_pending)) + goto out; + entry = list_first_entry(&async_pending, struct async_entry, list); + + /* 2) move it to the running queue */ + list_del(&entry->list); + list_add_tail(&entry->list, &async_running); + spin_unlock_irqrestore(&async_lock, flags); + + /* 3) run it (and print duration)*/ + if (initcall_debug && system_state == SYSTEM_BOOTING) { + printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); + calltime = ktime_get(); + } + entry->func(entry->data, entry->cookie); + if (initcall_debug && system_state == SYSTEM_BOOTING) { + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, + entry->func, ktime_to_ns(delta) >> 10); + } + + /* 4) remove it from the running queue */ + spin_lock_irqsave(&async_lock, flags); + list_del(&entry->list); + + /* 5) free the entry */ + kfree(entry); + atomic_dec(&entry_count); + + spin_unlock_irqrestore(&async_lock, flags); + + /* 6) wake up any waiters. */ + wake_up(&async_done); + return; + +out: + spin_unlock_irqrestore(&async_lock, flags); +} + + +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +{ + struct async_entry *entry; + unsigned long flags; + async_cookie_t newcookie; + + + /* allow irq-off callers */ + entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); + + /* + * If we're out of memory or if there's too much work + * pending already, we execute synchronously. + */ + if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + kfree(entry); + spin_lock_irqsave(&async_lock, flags); + newcookie = next_cookie++; + spin_unlock_irqrestore(&async_lock, flags); + + /* low on memory.. run synchronously */ + ptr(data, newcookie); + return newcookie; + } + entry->func = ptr; + entry->data = data; + entry->running = running; + + spin_lock_irqsave(&async_lock, flags); + newcookie = entry->cookie = next_cookie++; + list_add_tail(&entry->list, &async_pending); + atomic_inc(&entry_count); + spin_unlock_irqrestore(&async_lock, flags); + wake_up(&async_new); + return newcookie; +} + +async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +{ + return __async_schedule(ptr, data, &async_pending); +} +EXPORT_SYMBOL_GPL(async_schedule); + +async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) +{ + return __async_schedule(ptr, data, running); +} +EXPORT_SYMBOL_GPL(async_schedule_special); + +void async_synchronize_full(void) +{ + do { + async_synchronize_cookie(next_cookie); + } while (!list_empty(&async_running) || !list_empty(&async_pending)); +} +EXPORT_SYMBOL_GPL(async_synchronize_full); + +void async_synchronize_full_special(struct list_head *list) +{ + async_synchronize_cookie_special(next_cookie, list); +} +EXPORT_SYMBOL_GPL(async_synchronize_full_special); + +void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) +{ + ktime_t starttime, delta, endtime; + + if (initcall_debug && system_state == SYSTEM_BOOTING) { + printk("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); + } + + wait_event(async_done, __lowest_in_progress(running) >= cookie); + + if (initcall_debug && system_state == SYSTEM_BOOTING) { + endtime = ktime_get(); + delta = ktime_sub(endtime, starttime); + + printk("async_continuing @ %i after %lli usec\n", + task_pid_nr(current), ktime_to_ns(delta) >> 10); + } +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); + +void async_synchronize_cookie(async_cookie_t cookie) +{ + async_synchronize_cookie_special(cookie, &async_running); +} +EXPORT_SYMBOL_GPL(async_synchronize_cookie); + + +static int async_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int ret = HZ; + set_current_state(TASK_INTERRUPTIBLE); + /* + * check the list head without lock.. false positives + * are dealt with inside run_one_entry() while holding + * the lock. + */ + rmb(); + if (!list_empty(&async_pending)) + run_one_entry(); + else + ret = schedule_timeout(HZ); + + if (ret == 0) { + /* + * we timed out, this means we as thread are redundant. + * we sign off and die, but we to avoid any races there + * is a last-straw check to see if work snuck in. + */ + atomic_dec(&thread_count); + wmb(); /* manager must see our departure first */ + if (list_empty(&async_pending)) + break; + /* + * woops work came in between us timing out and us + * signing off; we need to stay alive and keep working. + */ + atomic_inc(&thread_count); + } + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int async_manager_thread(void *unused) +{ + DECLARE_WAITQUEUE(wq, current); + add_wait_queue(&async_new, &wq); + + while (!kthread_should_stop()) { + int tc, ec; + + set_current_state(TASK_INTERRUPTIBLE); + + tc = atomic_read(&thread_count); + rmb(); + ec = atomic_read(&entry_count); + + while (tc < ec && tc < MAX_THREADS) { + kthread_run(async_thread, NULL, "async/%i", tc); + atomic_inc(&thread_count); + tc++; + } + + schedule(); + } + remove_wait_queue(&async_new, &wq); + + return 0; +} + +static int __init async_init(void) +{ + if (async_enabled) + kthread_run(async_manager_thread, NULL, "async/mgr"); + return 0; +} + +static int __init setup_async(char *str) +{ + async_enabled = 1; + return 1; +} + +__setup("fastboot", setup_async); + + +core_initcall(async_init); diff --git a/kernel/audit.h b/kernel/audit.h index 9d6717412fec..16f18cac661b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return __audit_signal_info(sig, t); return 0; } -extern enum audit_state audit_filter_inodes(struct task_struct *, - struct audit_context *); -extern void audit_set_auditable(struct audit_context *); +extern void audit_filter_inodes(struct task_struct *, struct audit_context *); #else #define audit_signal_info(s,t) AUDIT_DISABLED #define audit_filter_inodes(t,c) AUDIT_DISABLED -#define audit_set_auditable(c) #endif diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8b509441f49a..8ad9545b8db9 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree) audit_log_end(ab); rule->tree = NULL; list_del_rcu(&entry->list); + list_del(&entry->rule.list); call_rcu(&entry->rcu, audit_free_rule_rcu); } } @@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) if (pathname[0] != '/' || rule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || rule->inode_f || rule->watch || rule->tree) return -EINVAL; rule->tree = alloc_tree(pathname); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9fd85a4640a0..fbf24d121d97 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #error Fix audit_filter_list initialiser #endif }; +static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_rules_list[0]), + LIST_HEAD_INIT(audit_rules_list[1]), + LIST_HEAD_INIT(audit_rules_list[2]), + LIST_HEAD_INIT(audit_rules_list[3]), + LIST_HEAD_INIT(audit_rules_list[4]), + LIST_HEAD_INIT(audit_rules_list[5]), +}; DEFINE_MUTEX(audit_filter_mutex); @@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree) + krule->watch || krule->inode_f || krule->tree || + (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; krule->inode_f = f; @@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, if (path[0] != '/' || path[len-1] == '/' || krule->listnr != AUDIT_FILTER_EXIT || - op & ~AUDIT_EQUAL || + op != Audit_equal || krule->inode_f || krule->watch || krule->tree) return -EINVAL; @@ -412,12 +421,32 @@ exit_err: return ERR_PTR(err); } +static u32 audit_ops[] = +{ + [Audit_equal] = AUDIT_EQUAL, + [Audit_not_equal] = AUDIT_NOT_EQUAL, + [Audit_bitmask] = AUDIT_BIT_MASK, + [Audit_bittest] = AUDIT_BIT_TEST, + [Audit_lt] = AUDIT_LESS_THAN, + [Audit_gt] = AUDIT_GREATER_THAN, + [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, + [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, +}; + +static u32 audit_to_op(u32 op) +{ + u32 n; + for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) + ; + return n; +} + + /* Translate struct audit_rule to kernel's rule respresentation. * Exists for backward compatibility with userspace. */ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *ino_f; int err = 0; int i; @@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 n; + + n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (n & AUDIT_NEGATE) + f->op = Audit_not_equal; + else if (!n) + f->op = Audit_equal; + else + f->op = audit_to_op(n); + + entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1; - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; err = -EINVAL; + if (f->op == Audit_bad) + goto exit_free; + switch(f->type) { default: goto exit_free; @@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EXIT: case AUDIT_SUCCESS: /* bit ops are only useful on syscall args */ - if (f->op == AUDIT_BIT_MASK || - f->op == AUDIT_BIT_TEST) { - err = -EINVAL; + if (f->op == Audit_bitmask || f->op == Audit_bittest) goto exit_free; - } break; case AUDIT_ARG0: case AUDIT_ARG1: @@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) break; /* arch is only allowed to be = or != */ case AUDIT_ARCH: - if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) - && (f->op != AUDIT_NEGATE) && (f->op)) { - err = -EINVAL; + if (f->op != Audit_not_equal && f->op != Audit_equal) goto exit_free; - } entry->rule.arch_f = f; break; case AUDIT_PERM: @@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; } - - entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (f->op & AUDIT_NEGATE) - f->op = AUDIT_NOT_EQUAL; - else if (!f->op) - f->op = AUDIT_EQUAL; - else if (f->op == AUDIT_OPERATORS) { - err = -EINVAL; - goto exit_free; - } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, struct audit_field *f = &entry->rule.fields[i]; err = -EINVAL; - if (!(data->fieldflags[i] & AUDIT_OPERATORS) || - data->fieldflags[i] & ~AUDIT_OPERATORS) + + f->op = audit_to_op(data->fieldflags[i]); + if (f->op == Audit_bad) goto exit_free; - f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; f->lsm_str = NULL; @@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } } - ino_f = entry->rule.inode_f; - if (ino_f) { - switch(ino_f->op) { - case AUDIT_NOT_EQUAL: - entry->rule.inode_f = NULL; - case AUDIT_EQUAL: - break; - default: - err = -EINVAL; - goto exit_free; - } - } + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; exit_nofree: return entry; @@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule->fields[i] = krule->fields[i].type; if (krule->vers_ops == 1) { - if (krule->fields[i].op & AUDIT_NOT_EQUAL) + if (krule->fields[i].op == Audit_not_equal) rule->fields[i] |= AUDIT_NEGATE; } else { - rule->fields[i] |= krule->fields[i].op; + rule->fields[i] |= audit_ops[krule->fields[i].op]; } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; @@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) struct audit_field *f = &krule->fields[i]; data->fields[i] = f->type; - data->fieldflags[i] = f->op; + data->fieldflags[i] = audit_ops[f->op]; switch(f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: @@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->action = old->action; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; + new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; new->watch = NULL; @@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context && - audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) - audit_set_auditable(current->audit_context); + if (invalidating && current->audit_context) + audit_filter_inodes(current, current->audit_context); nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { @@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (IS_ERR(nentry)) + if (IS_ERR(nentry)) { + list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); - else { + } else { int h = audit_hash_ino((u32)ino); list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); + list_replace(&oentry->rule.list, + &nentry->rule.list); } call_rcu(&oentry->rcu, audit_free_rule_rcu); @@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_log_end(ab); } list_del(&r->rlist); + list_del(&r->list); list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); } @@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list) /* Find an existing audit rule. * Caller must hold audit_filter_mutex to prevent stale rule data. */ static struct audit_entry *audit_find_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head **p) { struct audit_entry *e, *found = NULL; + struct list_head *list; int h; - if (entry->rule.watch) { + if (entry->rule.inode_f) { + h = audit_hash_ino(entry->rule.inode_f->val); + *p = list = &audit_inode_hash[h]; + } else if (entry->rule.watch) { /* we don't know the inode number, so must walk entire hash */ for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { list = &audit_inode_hash[h]; @@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry, } } goto out; + } else { + *p = list = &audit_filter_list[entry->rule.listnr]; } list_for_each_entry(e, list, list) @@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, return ret; } +static u64 prio_low = ~0ULL/2; +static u64 prio_high = ~0ULL/2 - 1; + /* Add rule to given filterlist if not a duplicate. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct nameidata *ndp = NULL, *ndw = NULL; + struct list_head *list; int h, err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); mutex_unlock(&audit_filter_mutex); if (e) { err = -EEXIST; @@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry, } } + entry->rule.prio = ~0ULL; + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { + if (entry->rule.flags & AUDIT_FILTER_PREPEND) + entry->rule.prio = ++prio_high; + else + entry->rule.prio = --prio_low; + } + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_rcu(&entry->list, list); entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { + list_add_tail(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); list_add_tail_rcu(&entry->list, list); } #ifdef CONFIG_AUDITSYSCALL @@ -1345,15 +1369,14 @@ error: } /* Remove an existing rule from filterlist. */ -static inline int audit_del_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_del_rule(struct audit_entry *entry) { struct audit_entry *e; - struct audit_field *inode_f = entry->rule.inode_f; struct audit_watch *watch, *tmp_watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; + struct list_head *list; LIST_HEAD(inotify_list); - int h, ret = 0; + int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry, dont_count = 1; #endif - if (inode_f) { - h = audit_hash_ino(inode_f->val); - list = &audit_inode_hash[h]; - } - mutex_lock(&audit_filter_mutex); - e = audit_find_rule(entry, list); + e = audit_find_rule(entry, &list); if (!e) { mutex_unlock(&audit_filter_mutex); ret = -ENOENT; @@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry, audit_remove_tree_rule(&e->rule); list_del_rcu(&e->list); + list_del(&e->rule.list); call_rcu(&e->rcu, audit_free_rule_rcu); #ifdef CONFIG_AUDITSYSCALL @@ -1432,30 +1451,16 @@ out: static void audit_list(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *entry; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(&entry->rule); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(entry, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule *rule; - rule = audit_krule_to_rule(&entry->rule); + rule = audit_krule_to_rule(r); if (unlikely(!rule)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, @@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { struct sk_buff *skb; - struct audit_entry *e; + struct audit_krule *r; int i; /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(e, &audit_filter_list[i], list) { - struct audit_rule_data *data; - - data = audit_krule_to_data(&e->rule); - if (unlikely(!data)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data) + data->buflen); - if (skb) - skb_queue_tail(q, skb); - kfree(data); - } - } - for (i=0; i< AUDIT_INODE_BUCKETS; i++) { - list_for_each_entry(e, &audit_inode_hash[i], list) { + list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; - data = audit_krule_to_data(&e->rule); + data = audit_krule_to_data(r); if (unlikely(!data)) break; skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, @@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_add_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_add_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "add", &entry->rule, !err); @@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, if (IS_ERR(entry)) return PTR_ERR(entry); - err = audit_del_rule(entry, - &audit_filter_list[entry->rule.listnr]); + err = audit_del_rule(entry); audit_log_rule_change(loginuid, sessionid, sid, "remove", &entry->rule, !err); @@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -int audit_comparator(const u32 left, const u32 op, const u32 right) +int audit_comparator(u32 left, u32 op, u32 right) { switch (op) { - case AUDIT_EQUAL: + case Audit_equal: return (left == right); - case AUDIT_NOT_EQUAL: + case Audit_not_equal: return (left != right); - case AUDIT_LESS_THAN: + case Audit_lt: return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: + case Audit_le: return (left <= right); - case AUDIT_GREATER_THAN: + case Audit_gt: return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: + case Audit_ge: return (left >= right); - case AUDIT_BIT_MASK: + case Audit_bitmask: return (left & right); - case AUDIT_BIT_TEST: + case Audit_bittest: return ((left & right) == right); + default: + BUG(); + return 0; } - BUG(); - return 0; } /* Compare given dentry name with last component in given path, @@ -1778,6 +1768,43 @@ unlock_and_return: return result; } +static int update_lsm_rule(struct audit_krule *r) +{ + struct audit_entry *entry = container_of(r, struct audit_entry, rule); + struct audit_entry *nentry; + struct audit_watch *watch; + struct audit_tree *tree; + int err = 0; + + if (!security_audit_rule_known(r)) + return 0; + + watch = r->watch; + tree = r->tree; + nentry = audit_dupe_rule(r, watch); + if (IS_ERR(nentry)) { + /* save the first error encountered for the + * return value */ + err = PTR_ERR(nentry); + audit_panic("error updating LSM filters"); + if (watch) + list_del(&r->rlist); + list_del_rcu(&entry->list); + list_del(&r->list); + } else { + if (watch) { + list_add(&nentry->rule.rlist, &watch->rules); + list_del(&r->rlist); + } else if (tree) + list_replace_init(&r->rlist, &nentry->rule.rlist); + list_replace_rcu(&entry->list, &nentry->list); + list_replace(&r->list, &nentry->rule.list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + + return err; +} + /* This function will re-initialize the lsm_rule field of all applicable rules. * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the @@ -1785,45 +1812,19 @@ unlock_and_return: * updated rule. */ int audit_update_lsm_rules(void) { - struct audit_entry *entry, *n, *nentry; - struct audit_watch *watch; - struct audit_tree *tree; + struct audit_krule *r, *n; int i, err = 0; /* audit_filter_mutex synchronizes the writers */ mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { - list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!security_audit_rule_known(&entry->rule)) - continue; - - watch = entry->rule.watch; - tree = entry->rule.tree; - nentry = audit_dupe_rule(&entry->rule, watch); - if (IS_ERR(nentry)) { - /* save the first error encountered for the - * return value */ - if (!err) - err = PTR_ERR(nentry); - audit_panic("error updating LSM filters"); - if (watch) - list_del(&entry->rule.rlist); - list_del_rcu(&entry->list); - } else { - if (watch) { - list_add(&nentry->rule.rlist, - &watch->rules); - list_del(&entry->rule.rlist); - } else if (tree) - list_replace_init(&entry->rule.rlist, - &nentry->rule.rlist); - list_replace_rcu(&entry->list, &nentry->list); - } - call_rcu(&entry->rcu, audit_free_rule_rcu); + list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { + int res = update_lsm_rule(r); + if (!err) + err = res; } } - mutex_unlock(&audit_filter_mutex); return err; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4819f3711973..8cbddff6c283 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -124,43 +124,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_mq_open { - struct audit_aux_data d; - int oflag; - mode_t mode; - struct mq_attr attr; -}; - -struct audit_aux_data_mq_sendrecv { - struct audit_aux_data d; - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; -}; - -struct audit_aux_data_mq_notify { - struct audit_aux_data d; - mqd_t mqdes; - struct sigevent notification; -}; - -struct audit_aux_data_mq_getsetattr { - struct audit_aux_data d; - mqd_t mqdes; - struct mq_attr mqstat; -}; - -struct audit_aux_data_ipcctl { - struct audit_aux_data d; - struct ipc_perm p; - unsigned long qbytes; - uid_t uid; - gid_t gid; - mode_t mode; - u32 osid; -}; - struct audit_aux_data_execve { struct audit_aux_data d; int argc; @@ -168,23 +131,6 @@ struct audit_aux_data_execve { struct mm_struct *mm; }; -struct audit_aux_data_socketcall { - struct audit_aux_data d; - int nargs; - unsigned long args[0]; -}; - -struct audit_aux_data_sockaddr { - struct audit_aux_data d; - int len; - char a[0]; -}; - -struct audit_aux_data_fd_pair { - struct audit_aux_data d; - int fd[2]; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -219,14 +165,14 @@ struct audit_tree_refs { struct audit_context { int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state; + enum audit_state state, current_state; unsigned int serial; /* serial number for record */ struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ unsigned long argv[4]; /* syscall arguments */ int return_valid; /* return code is valid */ long return_code;/* syscall return code */ - int auditable; /* 1 if record should be written */ + u64 prio; int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -234,7 +180,8 @@ struct audit_context { struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; - + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; /* Save things to print about task_struct */ pid_t pid, ppid; uid_t uid, euid, suid, fsuid; @@ -252,6 +199,49 @@ struct audit_context { struct audit_tree_refs *trees, *first_trees; int tree_count; + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + struct { + uid_t uid; + gid_t gid; + mode_t mode; + u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + mode_t perm_mode; + unsigned long qbytes; + } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; + struct { + int oflag; + mode_t mode; + struct mq_attr attr; + } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; + }; + int fds[2]; + #if AUDIT_DEBUG int put_count; int ino_count; @@ -608,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } /* Find ipc objects that match */ - if (ctx) { - struct audit_aux_data *aux; - for (aux = ctx->aux; aux; - aux = aux->next) { - if (aux->type == AUDIT_IPC) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { - ++result; - break; - } - } - } - } + if (!ctx || ctx->type != AUDIT_IPC) + break; + if (security_audit_rule_match(ctx->ipc.osid, + f->type, f->op, + f->lsm_rule, ctx)) + ++result; } break; case AUDIT_ARG0: @@ -647,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk, return 0; } } - if (rule->filterkey && ctx) - ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + + if (ctx) { + if (rule->prio <= ctx->prio) + return 0; + if (rule->filterkey) { + kfree(ctx->filterkey); + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + } + ctx->prio = rule->prio; + } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; @@ -661,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk, * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ -static enum audit_state audit_filter_task(struct task_struct *tsk) +static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; @@ -669,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (state == AUDIT_RECORD_CONTEXT) + *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } @@ -702,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, audit_filter_rules(tsk, &e->rule, ctx, NULL, &state)) { rcu_read_unlock(); + ctx->current_state = state; return state; } } @@ -715,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, * buckets applicable to the inode numbers in audit_names[]. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ -enum audit_state audit_filter_inodes(struct task_struct *tsk, - struct audit_context *ctx) +void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { int i; struct audit_entry *e; enum audit_state state; if (audit_pid && tsk->tgid == audit_pid) - return AUDIT_DISABLED; + return; rcu_read_lock(); for (i = 0; i < ctx->name_count; i++) { @@ -740,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk, if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); - return state; + ctx->current_state = state; + return; } } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; } -void audit_set_auditable(struct audit_context *ctx) +static void audit_set_auditable(struct audit_context *ctx) { - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } } static inline struct audit_context *audit_get_context(struct task_struct *tsk, @@ -781,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, else context->return_code = return_code; - if (context->in_syscall && !context->dummy && !context->auditable) { - enum audit_state state; - - state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); - if (state == AUDIT_RECORD_CONTEXT) { - context->auditable = 1; - goto get_context; - } - - state = audit_filter_inodes(tsk, context); - if (state == AUDIT_RECORD_CONTEXT) - context->auditable = 1; - + if (context->in_syscall && !context->dummy) { + audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + audit_filter_inodes(tsk, context); } -get_context: - tsk->audit_context = NULL; return context; } @@ -807,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context) int i; #if AUDIT_DEBUG == 2 - if (context->auditable - ||context->put_count + context->ino_count != context->name_count) { + if (context->put_count + context->ino_count != context->name_count) { printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -859,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context, { memset(context, 0, sizeof(*context)); context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; } static inline struct audit_context *audit_alloc_context(enum audit_state state) @@ -884,18 +868,21 @@ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; + char *key = NULL; if (likely(!audit_ever_enabled)) return 0; /* Return if not auditing. */ - state = audit_filter_task(tsk); + state = audit_filter_task(tsk, &key); if (likely(state == AUDIT_DISABLED)) return 0; if (!(context = audit_alloc_context(state))) { + kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } + context->filterkey = key; tsk->audit_context = context; set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); @@ -921,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context) free_tree_refs(context); audit_free_aux(context); kfree(context->filterkey); + kfree(context->sockaddr); kfree(context); context = previous; } while (context); @@ -1230,6 +1218,97 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); } +static void show_special(struct audit_context *context, int *call_panic) +{ + struct audit_buffer *ab; + int i; + + ab = audit_log_start(context, GFP_KERNEL, context->type); + if (!ab) + return; + + switch (context->type) { + case AUDIT_SOCKETCALL: { + int nargs = context->socketcall.nargs; + audit_log_format(ab, "nargs=%d", nargs); + for (i = 0; i < nargs; i++) + audit_log_format(ab, " a%d=%lx", i, + context->socketcall.args[i]); + break; } + case AUDIT_IPC: { + u32 osid = context->ipc.osid; + + audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + context->ipc.uid, context->ipc.gid, context->ipc.mode); + if (osid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", osid); + *call_panic = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + if (context->ipc.has_perm) { + audit_log_end(ab); + ab = audit_log_start(context, GFP_KERNEL, + AUDIT_IPC_SET_PERM); + audit_log_format(ab, + "qbytes=%lx ouid=%u ogid=%u mode=%#o", + context->ipc.qbytes, + context->ipc.perm_uid, + context->ipc.perm_gid, + context->ipc.perm_mode); + if (!ab) + return; + } + break; } + case AUDIT_MQ_OPEN: { + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + context->mq_open.oflag, context->mq_open.mode, + context->mq_open.attr.mq_flags, + context->mq_open.attr.mq_maxmsg, + context->mq_open.attr.mq_msgsize, + context->mq_open.attr.mq_curmsgs); + break; } + case AUDIT_MQ_SENDRECV: { + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + context->mq_sendrecv.mqdes, + context->mq_sendrecv.msg_len, + context->mq_sendrecv.msg_prio, + context->mq_sendrecv.abs_timeout.tv_sec, + context->mq_sendrecv.abs_timeout.tv_nsec); + break; } + case AUDIT_MQ_NOTIFY: { + audit_log_format(ab, "mqdes=%d sigev_signo=%d", + context->mq_notify.mqdes, + context->mq_notify.sigev_signo); + break; } + case AUDIT_MQ_GETSETATTR: { + struct mq_attr *attr = &context->mq_getsetattr.mqstat; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + context->mq_getsetattr.mqdes, + attr->mq_flags, attr->mq_maxmsg, + attr->mq_msgsize, attr->mq_curmsgs); + break; } + case AUDIT_CAPSET: { + audit_log_format(ab, "pid=%d", context->capset.pid); + audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); + audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); + audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + break; } + } + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1307,94 +1386,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { - case AUDIT_MQ_OPEN: { - struct audit_aux_data_mq_open *axi = (void *)aux; - audit_log_format(ab, - "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " - "mq_msgsize=%ld mq_curmsgs=%ld", - axi->oflag, axi->mode, axi->attr.mq_flags, - axi->attr.mq_maxmsg, axi->attr.mq_msgsize, - axi->attr.mq_curmsgs); - break; } - - case AUDIT_MQ_SENDRECV: { - struct audit_aux_data_mq_sendrecv *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", - axi->mqdes, axi->msg_len, axi->msg_prio, - axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); - break; } - - case AUDIT_MQ_NOTIFY: { - struct audit_aux_data_mq_notify *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d sigev_signo=%d", - axi->mqdes, - axi->notification.sigev_signo); - break; } - - case AUDIT_MQ_GETSETATTR: { - struct audit_aux_data_mq_getsetattr *axi = (void *)aux; - audit_log_format(ab, - "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " - "mq_curmsgs=%ld ", - axi->mqdes, - axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, - axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); - break; } - - case AUDIT_IPC: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "ouid=%u ogid=%u mode=%#o", - axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - break; } - - case AUDIT_IPC_SET_PERM: { - struct audit_aux_data_ipcctl *axi = (void *)aux; - audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%#o", - axi->qbytes, axi->uid, axi->gid, axi->mode); - break; } case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; audit_log_execve_info(context, &ab, axi); break; } - case AUDIT_SOCKETCALL: { - struct audit_aux_data_socketcall *axs = (void *)aux; - audit_log_format(ab, "nargs=%d", axs->nargs); - for (i=0; i<axs->nargs; i++) - audit_log_format(ab, " a%d=%lx", i, axs->args[i]); - break; } - - case AUDIT_SOCKADDR: { - struct audit_aux_data_sockaddr *axs = (void *)aux; - - audit_log_format(ab, "saddr="); - audit_log_n_hex(ab, axs->a, axs->len); - break; } - - case AUDIT_FD_PAIR: { - struct audit_aux_data_fd_pair *axs = (void *)aux; - audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1409,18 +1406,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); break; } - case AUDIT_CAPSET: { - struct audit_aux_data_capset *axs = (void *)aux; - audit_log_format(ab, "pid=%d", axs->pid); - audit_log_cap(ab, "cap_pi", &axs->cap.inheritable); - audit_log_cap(ab, "cap_pp", &axs->cap.permitted); - audit_log_cap(ab, "cap_pe", &axs->cap.effective); - break; } - } audit_log_end(ab); } + if (context->type) + show_special(context, &call_panic); + + if (context->fds[0] >= 0) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); + if (ab) { + audit_log_format(ab, "fd0=%d fd1=%d", + context->fds[0], context->fds[1]); + audit_log_end(ab); + } + } + + if (context->sockaddr_len) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); + if (ab) { + audit_log_format(ab, "saddr="); + audit_log_n_hex(ab, (void *)context->sockaddr, + context->sockaddr_len); + audit_log_end(ab); + } + } + for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; @@ -1536,7 +1547,7 @@ void audit_free(struct task_struct *tsk) * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ /* that can happen only if we are called from do_exit() */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); audit_free_context(context); @@ -1620,15 +1631,17 @@ void audit_syscall_entry(int arch, int major, state = context->state; context->dummy = !audit_n_rules; - if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + } if (likely(state == AUDIT_DISABLED)) return; context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; - context->auditable = !!(state == AUDIT_RECORD_CONTEXT); + context->current_state = state; context->ppid = 0; } @@ -1636,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child) { struct audit_context *ctx = current->audit_context; struct audit_context *p = child->audit_context; - if (!p || !ctx || !ctx->auditable) + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) return; p->arch = ctx->arch; p->major = ctx->major; memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); p->ctime = ctx->ctime; p->dummy = ctx->dummy; - p->auditable = ctx->auditable; p->in_syscall = ctx->in_syscall; p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; } /** @@ -1670,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) audit_log_exit(context, tsk); context->in_syscall = 0; - context->auditable = 0; + context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; if (context->previous) { struct audit_context *new_context = context->previous; @@ -1689,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code) context->aux_pids = NULL; context->target_pid = 0; context->target_sid = 0; - kfree(context->filterkey); - context->filterkey = NULL; + context->sockaddr_len = 0; + context->type = 0; + context->fds[0] = -1; + if (context->state != AUDIT_RECORD_CONTEXT) { + kfree(context->filterkey); + context->filterkey = NULL; + } tsk->audit_context = context; } } @@ -2081,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx, t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; - ctx->auditable = 1; + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } return 1; } @@ -2127,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * @mode: mode bits * @u_attr: queue attributes * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) { - struct audit_aux_data_mq_open *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_attr != NULL) { - if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->attr, 0, sizeof(ax->attr)); + if (attr) + memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); + else + memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); - ax->oflag = oflag; - ax->mode = mode; + context->mq_open.oflag = oflag; + context->mq_open.mode = mode; - ax->d.type = AUDIT_MQ_OPEN; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_OPEN; } /** - * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time + * @abs_timeout: Message timeout in absolute time * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec __user *u_abs_timeout) +void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec *abs_timeout) { - struct audit_aux_data_mq_sendrecv *ax; struct audit_context *context = current->audit_context; + struct timespec *p = &context->mq_sendrecv.abs_timeout; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); - - ax->mqdes = mqdes; - ax->msg_len = msg_len; - ax->msg_prio = msg_prio; - - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** - * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive - * @mqdes: MQ descriptor - * @msg_len: Message length - * @u_msg_prio: Message priority - * @u_abs_timeout: Message timeout in absolute time - * - * Returns 0 for success or NULL context or < 0 on error. - */ -int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, - unsigned int __user *u_msg_prio, - const struct timespec __user *u_abs_timeout) -{ - struct audit_aux_data_mq_sendrecv *ax; - struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_msg_prio != NULL) { - if (get_user(ax->msg_prio, u_msg_prio)) { - kfree(ax); - return -EFAULT; - } - } else - ax->msg_prio = 0; - - if (u_abs_timeout != NULL) { - if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + if (abs_timeout) + memcpy(p, abs_timeout, sizeof(struct timespec)); + else + memset(p, 0, sizeof(struct timespec)); - ax->mqdes = mqdes; - ax->msg_len = msg_len; + context->mq_sendrecv.mqdes = mqdes; + context->mq_sendrecv.msg_len = msg_len; + context->mq_sendrecv.msg_prio = msg_prio; - ax->d.type = AUDIT_MQ_SENDRECV; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_MQ_SENDRECV; } /** @@ -2260,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, * @mqdes: MQ descriptor * @u_notification: Notification event * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_aux_data_mq_notify *ax; struct audit_context *context = current->audit_context; - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - if (u_notification != NULL) { - if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { - kfree(ax); - return -EFAULT; - } - } else - memset(&ax->notification, 0, sizeof(ax->notification)); - - ax->mqdes = mqdes; + if (notification) + context->mq_notify.sigev_signo = notification->sigev_signo; + else + context->mq_notify.sigev_signo = 0; - ax->d.type = AUDIT_MQ_NOTIFY; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_notify.mqdes = mqdes; + context->type = AUDIT_MQ_NOTIFY; } /** @@ -2299,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) * @mqdes: MQ descriptor * @mqstat: MQ flags * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; - - if (!audit_enabled) - return 0; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->mqdes = mqdes; - ax->mqstat = *mqstat; - - ax->d.type = AUDIT_MQ_GETSETATTR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->mq_getsetattr.mqdes = mqdes; + context->mq_getsetattr.mqstat = *mqstat; + context->type = AUDIT_MQ_GETSETATTR; } /** * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->uid = ipcp->uid; - ax->gid = ipcp->gid; - ax->mode = ipcp->mode; - security_ipc_getsecid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.uid = ipcp->uid; + context->ipc.gid = ipcp->gid; + context->ipc.mode = ipcp->mode; + context->ipc.has_perm = 0; + security_ipc_getsecid(ipcp, &context->ipc.osid); + context->type = AUDIT_IPC; } /** @@ -2357,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @gid: msgq group id * @mode: msgq mode (permissions) * - * Returns 0 for success or NULL context or < 0 on error. + * Called only after audit_ipc_obj(). */ -int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { - struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->qbytes = qbytes; - ax->uid = uid; - ax->gid = gid; - ax->mode = mode; - - ax->d.type = AUDIT_IPC_SET_PERM; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->ipc.qbytes = qbytes; + context->ipc.perm_uid = uid; + context->ipc.perm_gid = gid; + context->ipc.perm_mode = mode; + context->ipc.has_perm = 1; } int audit_bprm(struct linux_binprm *bprm) @@ -2406,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm) * @nargs: number of args * @args: args array * - * Returns 0 for success or NULL context or < 0 on error. */ -int audit_socketcall(int nargs, unsigned long *args) +void audit_socketcall(int nargs, unsigned long *args) { - struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->nargs = nargs; - memcpy(ax->args, args, nargs * sizeof(unsigned long)); + return; - ax->d.type = AUDIT_SOCKETCALL; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_SOCKETCALL; + context->socketcall.nargs = nargs; + memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); } /** @@ -2434,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args) * @fd1: the first file descriptor * @fd2: the second file descriptor * - * Returns 0 for success or NULL context or < 0 on error. */ -int __audit_fd_pair(int fd1, int fd2) +void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = current->audit_context; - struct audit_aux_data_fd_pair *ax; - - if (likely(!context)) { - return 0; - } - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) { - return -ENOMEM; - } - - ax->fd[0] = fd1; - ax->fd[1] = fd2; - - ax->d.type = AUDIT_FD_PAIR; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->fds[0] = fd1; + context->fds[1] = fd2; } /** @@ -2468,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2) */ int audit_sockaddr(int len, void *a) { - struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; if (likely(!context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->len = len; - memcpy(ax->a, a, len); + if (!context->sockaddr) { + void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); + if (!p) + return -ENOMEM; + context->sockaddr = p; + } - ax->d.type = AUDIT_SOCKADDR; - ax->d.next = context->aux; - context->aux = (void *)ax; + context->sockaddr_len = len; + memcpy(context->sockaddr, a, len); return 0; } @@ -2617,29 +2472,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, * Record the aguments userspace sent to sys_capset for later printing by the * audit system if applicable */ -int __audit_log_capset(pid_t pid, +void __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old) { - struct audit_aux_data_capset *ax; struct audit_context *context = current->audit_context; - - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->d.type = AUDIT_CAPSET; - ax->d.next = context->aux; - context->aux = (void *)ax; - - ax->pid = pid; - ax->cap.effective = new->cap_effective; - ax->cap.inheritable = new->cap_effective; - ax->cap.permitted = new->cap_permitted; - - return 0; + context->capset.pid = pid; + context->capset.cap.effective = new->cap_effective; + context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.permitted = new->cap_permitted; + context->type = AUDIT_CAPSET; } /** diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4daebec..688926e496be 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (ret < 0) goto error; - ret = audit_log_capset(pid, new, current_cred()); - if (ret < 0) - return ret; + audit_log_capset(pid, new, current_cred()); return commit_creds(new); @@ -308,7 +306,7 @@ int capable(int cap) BUG(); } - if (has_capability(current, cap)) { + if (security_capable(cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 48348dde6d81..c29831076e7a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -84,7 +84,7 @@ struct cgroupfs_root { /* Tracks how many cgroups are currently defined in hierarchy.*/ int number_of_cgroups; - /* A list running through the mounted hierarchies */ + /* A list running through the active hierarchies */ struct list_head root_list; /* Hierarchy-specific flags */ @@ -116,7 +116,6 @@ static int root_count; * be called. */ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_subsys(_root, _ss) \ list_for_each_entry(_ss, &_root->subsys_list, sibling) -/* for_each_root() allows you to iterate across the active hierarchies */ -#define for_each_root(_root) \ +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by @@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = cg->subsys[i]->cgroup; + struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) @@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } +/** + * link_css_set - a helper function to link a css_set to a cgroup + * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() + * @cg: the css_set to be linked + * @cgrp: the destination cgroup + */ +static void link_css_set(struct list_head *tmp_cg_links, + struct css_set *cg, struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + BUG_ON(list_empty(tmp_cg_links)); + link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, + cgrp_link_list); + link->cg = cg; + list_move(&link->cgrp_link_list, &cgrp->css_sets); + list_add(&link->cg_link_list, &cg->cg_links); +} + /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -400,7 +418,6 @@ static struct css_set *find_css_set( int i; struct list_head tmp_cg_links; - struct cg_cgroup_link *link; struct hlist_head *hhead; @@ -445,26 +462,11 @@ static struct css_set *find_css_set( * only do it for the first subsystem in each * hierarchy */ - if (ss->root->subsys_list.next == &ss->sibling) { - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &cgrp->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); - } - } - if (list_empty(&rootnode.subsys_list)) { - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - list_add(&link->cgrp_link_list, &dummytop->css_sets); - link->cg = res; - list_add(&link->cg_link_list, &res->cg_links); + if (ss->root->subsys_list.next == &ss->sibling) + link_css_set(&tmp_cg_links, res, cgrp); } + if (list_empty(&rootnode.subsys_list)) + link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; } @@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + if (ss->pre_destroy) ss->pre_destroy(ss, cgrp); return; } +static void free_cgroup_rcu(struct rcu_head *obj) +{ + struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); + + kfree(cgrp); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) /* * Release the subsystem state objects. */ - for_each_subsys(cgrp->root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } + for_each_subsys(cgrp->root, ss) + ss->destroy(ss, cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ + /* + * Drop the active superblock reference that we took when we + * created the cgroup + */ deactivate_super(cgrp->root->sb); - kfree(cgrp); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); } @@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + mutex_lock(&ss->hierarchy_mutex); cgrp->subsys[i] = dummytop->subsys[i]; cgrp->subsys[i]->cgroup = cgrp; - list_add(&ss->sibling, &root->subsys_list); - rcu_assign_pointer(ss->root, root); + list_move(&ss->sibling, &root->subsys_list); + ss->root = root; if (ss->bind) ss->bind(ss, cgrp); - + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + mutex_lock(&ss->hierarchy_mutex); if (ss->bind) ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; - rcu_assign_pointer(subsys[i]->root, &rootnode); - list_del(&ss->sibling); + subsys[i]->root = &rootnode; + list_move(&ss->sibling, &rootnode.subsys_list); + mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); @@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, root = NULL; } else { /* New superblock */ - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; int i; @@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, list_add(&root->root_list, &roots); root_count++; - sb->s_root->d_fsdata = &root->top_cgroup; + sb->s_root->d_fsdata = root_cgrp; root->top_cgroup.dentry = sb->s_root; /* Link the top cgroup in this hierarchy into all @@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct hlist_node *node; struct css_set *cg; - hlist_for_each_entry(cg, node, hhead, hlist) { - struct cg_cgroup_link *link; - - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - } + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&cgrp->sibling)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cgroup_populate_dir(cgrp); + cgroup_populate_dir(root_cgrp); mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); } @@ -1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) { } write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - root_count--; - } + list_del(&root->root_list); + root_count--; + mutex_unlock(&cgroup_mutex); kfree(root); @@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held. Writes path of cgroup into buf. - * Returns 0 on success, -errno on error. + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; + struct dentry *dentry = rcu_dereference(cgrp->dentry); - if (cgrp == dummytop) { + if (!dentry || cgrp == dummytop) { /* * Inactive subsystems have no dentry for their root * cgroup @@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { - int len = cgrp->dentry->d_name.len; + int len = dentry->d_name.len; if ((start -= len) < buf) return -ENAMETOOLONG; memcpy(start, cgrp->dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; + dentry = rcu_dereference(cgrp->dentry); if (!cgrp->parent) continue; if (--start < buf) @@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) int retval = 0; struct cgroup_subsys *ss; struct cgroup *oldcgrp; - struct css_set *cg = tsk->cgroups; + struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; int subsys_id; @@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); /* * Locate or allocate a new css_set for this task, * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); + put_css_set(cg); if (!newcg) return -ENOMEM; @@ -1447,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1492,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft || cgroup_is_removed(cgrp)) + if (cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1556,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); - if (!cft) - return -ENODEV; + if (cft->read_map || cft->read_seq_string) { struct cgroup_seqfile_state *state = kzalloc(sizeof(*state), GFP_USER); @@ -1673,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, if (!error) { dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); - cgrp->dentry = dentry; + rcu_assign_pointer(cgrp->dentry, dentry); dget(dentry); } dput(dentry); @@ -1814,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, { struct task_struct *res; struct list_head *l = it->task; + struct cg_cgroup_link *link; /* If the iterator cg is NULL, we have no tasks */ if (!it->cg_link) @@ -1821,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, res = list_entry(l, struct task_struct, cg_list); /* Advance iterator to find next entry */ l = l->next; - if (l == &res->cgroups->tasks) { + link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); + if (l == &link->cg->tasks) { /* We reached the end of this task list - move on to * the next cg_cgroup_link */ cgroup_advance_iter(cgrp, it); @@ -2015,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) { - int n = 0; + int n = 0, pid; struct cgroup_iter it; struct task_struct *tsk; cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_vnr(tsk); + pid = task_pid_vnr(tsk); + if (pid > 0) + pidarray[n++] = pid; } cgroup_iter_end(cgrp, &it); return n; @@ -2054,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - rcu_read_lock(); cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { @@ -2079,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } cgroup_iter_end(cgrp, &it); - rcu_read_unlock(); err: return ret; } @@ -2326,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2334,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; } +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_lock_nested(&ss->hierarchy_mutex, i); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -2382,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_css(css, ss, cgrp); } + cgroup_lock_hierarchy(root); list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -2433,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the - * cgroup, if the css refcount is also 0, then there should + * cgroup, if the css refcount is also 1, then there should * be no outstanding references, so the subsystem is safe to * destroy. We scan across all subsystems rather than using * the per-hierarchy linked list of mounted subsystems since @@ -2454,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; } +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + do { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct super_block *sb; - struct cgroupfs_root *root; /* the vfs holds both inode->i_mutex already */ @@ -2489,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_lock(&cgroup_mutex); parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children) - || cgroup_has_css_refs(cgrp)) { + || !cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -2504,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); - /* delete my sibling from parent->children */ + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + spin_lock(&cgrp->dentry->d_lock); d = dget(cgrp->dentry); spin_unlock(&d->d_lock); @@ -2527,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; css = ss->create(ss, dummytop); /* We don't handle early failures gracefully */ @@ -2540,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); + mutex_init(&ss->hierarchy_mutex); ss->active = 1; } @@ -2565,7 +2648,6 @@ int __init cgroup_init_early(void) INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); - list_add(&rootnode.root_list, &roots); root_count = 1; init_task.cgroups = &init_css_set; @@ -2672,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); - for_each_root(root) { + for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int subsys_id; int count = 0; - /* Skip this hierarchy if it has no active subsystems */ - if (!root->actual_subsys_bits) - continue; seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); @@ -2790,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question @@ -2834,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { write_lock(&css_set_lock); + task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); write_unlock(&css_set_lock); } } @@ -2941,14 +2991,20 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, mutex_unlock(&cgroup_mutex); return 0; } + task_lock(tsk); cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); /* Pin the hierarchy */ - atomic_inc(&parent->root->sb->s_active); + if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { + /* We race with the final deactivate_super() */ + mutex_unlock(&cgroup_mutex); + return 0; + } /* Keep the cgroup alive */ get_css_set(cg); + task_unlock(tsk); mutex_unlock(&cgroup_mutex); /* Now do the VFS work to create a cgroup */ @@ -2967,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + ret = vfs_mkdir(inode, dentry, 0755); child = __d_cgrp(dentry); dput(dentry); if (ret) { @@ -2977,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, goto out_release; } - if (!child) { - printk(KERN_INFO - "Couldn't find new cgroup %s\n", nodename); - ret = -ENOMEM; - goto out_release; - } - /* The cgroup now exists. Retake cgroup_mutex and check * that we're still in the same state that we thought we * were. */ @@ -3079,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { + if ((atomic_dec_return(&css->refcnt) == 1) && + notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } diff --git a/kernel/compat.c b/kernel/compat.c index d52e2ec1deb5..42d56544460f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -24,6 +24,7 @@ #include <linux/migrate.h> #include <linux/posix-timers.h> #include <linux/times.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> @@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } + force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } @@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 47fff3b63cbf..79e40f00dcb8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -269,8 +269,11 @@ out_release: int __ref cpu_down(unsigned int cpu) { - int err = 0; + int err; + err = stop_machine_create(); + if (err) + return err; cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -297,6 +300,7 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); + stop_machine_destroy(); return err; } EXPORT_SYMBOL(cpu_down); @@ -375,8 +379,11 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error = 0; + int cpu, first_cpu, error; + error = stop_machine_create(); + if (error) + return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); /* We take down all of the non-boot CPUs in one shot to avoid races @@ -405,6 +412,7 @@ int disable_nonboot_cpus(void) printk(KERN_ERR "Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); + stop_machine_destroy(); return error; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39c1a4c1c5a9..647c77a88fcb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -84,7 +84,7 @@ struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ - cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ struct cpuset *parent; /* my parent */ @@ -195,8 +195,6 @@ static int cpuset_mems_generation; static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .cpus_allowed = CPU_MASK_ALL, - .mems_allowed = NODE_MASK_ALL, }; /* @@ -240,6 +238,17 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = { }; /* - * Return in *pmask the portion of a cpusets's cpus_allowed that + * Return in pmask the portion of a cpusets's cpus_allowed that * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, @@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = { * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) { - while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = cs->parent; if (cs) - cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else - *pmask = cpu_online_map; - BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); + cpumask_copy(pmask, cpu_online_mask); + BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); } /* @@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void) struct task_struct *tsk = current; struct cpuset *cs; - if (task_cs(tsk) == &top_cpuset) { - /* Don't need rcu for top_cpuset. It's never freed. */ - my_cpusets_mem_gen = top_cpuset.mems_generation; - } else { - rcu_read_lock(); - my_cpusets_mem_gen = task_cs(tsk)->mems_generation; - rcu_read_unlock(); - } + rcu_read_lock(); + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; + rcu_read_unlock(); if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { mutex_lock(&callback_mutex); @@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void) static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpus_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); } +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +{ + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { + kfree(trial); + return NULL; + } + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + + return trial; +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + free_cpumask_var(trial->cpus_allowed); + kfree(trial); +} + /* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. @@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) return -EINVAL; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ if (cgroup_task_count(cur->css.cgroup)) { - if (cpus_empty(trial->cpus_allowed) || + if (cpumask_empty(trial->cpus_allowed) || nodes_empty(trial->mems_allowed)) { return -ENOSPC; } @@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) */ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { - return cpus_intersects(a->cpus_allowed, b->cpus_allowed); + return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); } static void @@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; if (is_sched_load_balance(cp)) @@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -static int generate_sched_domains(cpumask_t **domains, +/* FIXME: see the FIXME in partition_sched_domains() */ +static int generate_sched_domains(struct cpumask **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct cpumask *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] cpumask_t slot */ + int nslot; /* next empty doms[] struct cpumask slot */ doms = NULL; dattr = NULL; @@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - *doms = top_cpuset.cpus_allowed; + cpumask_copy(doms, top_cpuset.cpus_allowed); ndoms = 1; goto done; @@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains, cp = list_first_entry(&q, struct cpuset, stack_list); list_del(q.next); - if (cpus_empty(cp->cpus_allowed)) + if (cpumask_empty(cp->cpus_allowed)) continue; /* @@ -673,7 +710,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ - doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); + doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); if (!doms) goto done; @@ -685,7 +722,7 @@ restart: for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; - cpumask_t *dp; + struct cpumask *dp; int apn = a->pn; if (apn < 0) { @@ -708,14 +745,14 @@ restart: continue; } - cpus_clear(*dp); + cpumask_clear(dp); if (dattr) *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); + cpumask_or(dp, dp, b->cpus_allowed); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -755,7 +792,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; get_online_cpus(); @@ -824,7 +861,7 @@ void rebuild_sched_domains(void) static int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - return !cpus_equal(tsk->cpus_allowed, + return !cpumask_equal(&tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); } @@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, const char *buf) +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { struct ptr_heap heap; - struct cpuset trialcs; int retval; int is_load_balanced; @@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case @@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * with tasks have cpus. */ if (!*buf) { - cpus_clear(trialcs.cpus_allowed); + cpumask_clear(trialcs->cpus_allowed); } else { - retval = cpulist_parse(buf, &trialcs.cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_allowed); if (retval < 0) return retval; - if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) return -EINVAL; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) return retval; /* Nothing to do if the cpus didn't change */ - if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); if (retval) return retval; - is_load_balanced = is_sched_load_balance(&trialcs); + is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); - cs->cpus_allowed = trialcs.cpus_allowed; + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); /* @@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ - fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ retval = -ENOMEM; /* @@ -1093,9 +1128,9 @@ done: * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ -static int update_nodemask(struct cpuset *cs, const char *buf) +static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) { - struct cpuset trialcs; nodemask_t oldmem; int retval; @@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf) if (cs == &top_cpuset) return -EACCES; - trialcs = *cs; - /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. * Since nodelist_parse() fails on an empty mask, we special case @@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf) * with tasks have memory. */ if (!*buf) { - nodes_clear(trialcs.mems_allowed); + nodes_clear(trialcs->mems_allowed); } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); + retval = nodelist_parse(buf, trialcs->mems_allowed); if (retval < 0) goto done; - if (!nodes_subset(trialcs.mems_allowed, + if (!nodes_subset(trialcs->mems_allowed, node_states[N_HIGH_MEMORY])) return -EINVAL; } oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { + if (nodes_equal(oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - retval = validate_change(cs, &trialcs); + retval = validate_change(cs, trialcs); if (retval < 0) goto done; mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; + cs->mems_allowed = trialcs->mems_allowed; cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); @@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) async_rebuild_sched_domains(); } @@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { - struct cpuset trialcs; + struct cpuset *trialcs; int err; int balance_flag_changed; - trialcs = *cs; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + if (turning_on) - set_bit(bit, &trialcs.flags); + set_bit(bit, &trialcs->flags); else - clear_bit(bit, &trialcs.flags); + clear_bit(bit, &trialcs->flags); - err = validate_change(cs, &trialcs); + err = validate_change(cs, trialcs); if (err < 0) - return err; + goto out; balance_flag_changed = (is_sched_load_balance(cs) != - is_sched_load_balance(&trialcs)); + is_sched_load_balance(trialcs)); mutex_lock(&callback_mutex); - cs->flags = trialcs.flags; + cs->flags = trialcs->flags; mutex_unlock(&callback_mutex); - if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - return 0; +out: + free_trial_cpuset(trialcs); + return err; } /* @@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Protected by cgroup_lock */ +static cpumask_var_t cpus_attach; + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { struct cpuset *cs = cgroup_cs(cont); + int ret = 0; - if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - cpumask_t mask; + if (tsk->flags & PF_THREAD_BOUND) { mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) + ret = -EINVAL; mutex_unlock(&callback_mutex); - if (!cpus_equal(tsk->cpus_allowed, mask)) - return -EINVAL; } - return security_task_setscheduler(tsk, 0, NULL); + return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk) { - cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); int err; - mutex_lock(&callback_mutex); - guarantee_online_cpus(cs, &cpus); - err = set_cpus_allowed_ptr(tsk, &cpus); - mutex_unlock(&callback_mutex); + if (cs == &top_cpuset) { + cpumask_copy(cpus_attach, cpu_possible_mask); + } else { + mutex_lock(&callback_mutex); + guarantee_online_cpus(cs, cpus_attach); + mutex_unlock(&callback_mutex); + } + err = set_cpus_allowed_ptr(tsk, cpus_attach); if (err) return; @@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, cpuset_migrate_mm(mm, &from, &to); mmput(mm); } - } /* The various types of files and directories in a cpuset file system */ @@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *trialcs; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) + return -ENOMEM; + switch (cft->private) { case FILE_CPULIST: - retval = update_cpumask(cgroup_cs(cgrp), buf); + retval = update_cpumask(cs, trialcs, buf); break; case FILE_MEMLIST: - retval = update_nodemask(cgroup_cs(cgrp), buf); + retval = update_nodemask(cs, trialcs, buf); break; default: retval = -EINVAL; break; } + + free_trial_cpuset(trialcs); cgroup_unlock(); return retval; } @@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - cpumask_t mask; + int ret; mutex_lock(&callback_mutex); - mask = cs->cpus_allowed; + ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return cpulist_scnprintf(page, PAGE_SIZE, &mask); + return ret; } static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) @@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, parent_cs = cgroup_cs(parent); cs->mems_allowed = parent_cs->mems_allowed; - cs->cpus_allowed = parent_cs->cpus_allowed; + cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); return; } @@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create( cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { + kfree(cs); + return ERR_PTR(-ENOMEM); + } cpuset_update_task_memory_state(); cs->flags = 0; @@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cpus_clear(cs->cpus_allowed); + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; + free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = { int __init cpuset_init_early(void) { + alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed); + top_cpuset.mems_generation = cpuset_mems_generation++; return 0; } @@ -1817,7 +1875,7 @@ int __init cpuset_init(void) { int err = 0; - cpus_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); @@ -1829,6 +1887,9 @@ int __init cpuset_init(void) if (err < 0) return err; + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + number_of_cpusets = 1; return 0; } @@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * has online cpus, so can't be empty). */ parent = cs->parent; - while (cpus_empty(parent->cpus_allowed) || + while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) parent = parent->parent; @@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); - cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_online_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); /* Move tasks from the empty cpuset to a parent */ - if (cpus_empty(cp->cpus_allowed) || + if (cpumask_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { @@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - cpumask_t *doms; + struct cpumask *doms; int ndoms; switch (phase) { @@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, } cgroup_lock(); - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - top_cpuset.cpus_allowed = cpu_online_map; + cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void) /** * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. - * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. * - * Description: Returns the cpumask_t cpus_allowed of the cpuset + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_map, even if this means going outside the * tasks cpuset. **/ -void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { mutex_lock(&callback_mutex); cpuset_cpus_allowed_locked(tsk, pmask); @@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { task_lock(tsk); guarantee_online_cpus(task_cs(tsk), pmask); @@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? (const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991c..3a039189d707 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -372,7 +372,8 @@ int commit_creds(struct cred *new) old->fsuid != new->fsuid || old->fsgid != new->fsgid || !cap_issubset(new->cap_permitted, old->cap_permitted)) { - set_dumpable(task->mm, suid_dumpable); + if (task->mm) + set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; smp_wmb(); } @@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c2e111..038707404b76 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + struct dma_coherent_mem *mem; int order = get_order(size); + int pageno; - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; + if (!dev) + return 0; + mem = dev->dma_mem; + if (!mem) + return 0; + if (unlikely(size > mem->size)) + return 0; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (pageno >= 0) { + /* + * Memory was found in the per-device arena. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ + *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. + */ + return 0; } - return (mem != NULL); + return 1; } EXPORT_SYMBOL(dma_alloc_from_coherent); diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c14e08..c7740fa3252c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 43cbf30669e6..1d68f1255dd8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; + +static int __init coredump_filter_setup(char *s) +{ + default_dump_filter = + (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & + MMF_DUMP_FILTER_MASK; + return 1; +} + +__setup("coredump_filter=", coredump_filter_setup); + #include <linux/init_task.h> static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) @@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; + mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + if (clone_flags & CLONE_SIGHAND) { atomic_inc(¤t->sighand->count); return 0; } @@ -1115,12 +1126,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (pid != &init_struct_pid) { retval = -ENOMEM; - pid = alloc_pid(task_active_pid_ns(p)); + pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); if (retval < 0) goto bad_fork_free_pid; } @@ -1470,12 +1481,10 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + mmap_init(); } /* diff --git a/kernel/futex.c b/kernel/futex.c index 7c6cbabe52b3..002aa189eb09 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key) */ static void drop_futex_key_refs(union futex_key *key) { - if (!key->both.ptr) + if (!key->both.ptr) { + /* If we're here then we tried to put a key we failed to get */ + WARN_ON_ONCE(1); return; + } switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; } @@ -755,7 +758,7 @@ retryfull: goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -777,12 +780,12 @@ retry: * but we might get them from range checking */ ret = op_ret; - goto out; + goto out_put_keys; #endif if (unlikely(op_ret != -EFAULT)) { ret = op_ret; - goto out; + goto out_put_keys; } /* @@ -796,7 +799,7 @@ retry: ret = futex_handle_fault((unsigned long)uaddr2, attempt); if (ret) - goto out; + goto out_put_keys; goto retry; } @@ -834,10 +837,11 @@ retry: spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); - +out: return ret; } @@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_q *this, *next; int ret, drop_count = 0; - retry: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, if (!ret) goto retry; - return ret; + goto out_put_keys; } if (curval != *cmpval) { ret = -EAGAIN; @@ -927,9 +931,11 @@ out_unlock: while (--drop_count >= 0) drop_futex_key_refs(&key1); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); +out: return ret; } @@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q) int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ - retry: +retry: lock_ptr = q->lock_ptr; barrier(); if (lock_ptr != NULL) { @@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; hb = queue_lock(&q); @@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (unlikely(ret)) { queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); @@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, } ret = -EWOULDBLOCK; if (uval != val) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, return -ERESTART_RESTARTBLOCK; } - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - - out_release_sem: put_futex_key(fshared, &q.key); + +out: return ret; } @@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; - retry_unlocked: +retry_unlocked: hb = queue_lock(&q); - retry_locked: +retry_locked: ret = lock_taken = 0; /* @@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, */ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; - goto out_unlock_release_sem; + goto out_unlock_put_key; } /* * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) - goto out_unlock_release_sem; + goto out_unlock_put_key; uval = curval; @@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, * We took the lock due to owner died take over. */ if (unlikely(lock_taken)) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* * We dont have the lock. Look up the PI state (or create it if @@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, goto retry_locked; } default: - goto out_unlock_release_sem; + goto out_unlock_put_key; } } @@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - out_release_sem: +out_put_key: put_futex_key(fshared, &q.key); +out: if (to) destroy_hrtimer_on_stack(&to->timer); return ret; - uaddr_faulted: +uaddr_faulted: /* * We have to r/w *(int __user *)uaddr, and we have to modify it * atomically. Therefore, if we continue to fault after get_user() @@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) - goto out_release_sem; + goto out_put_key; goto retry_unlocked; } @@ -1675,9 +1683,9 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; pi_faulted: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2bfefa6dcc..1455b7651b6b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static void __run_hrtimer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - /* - * XXX: recursion check? - * hrtimer_forward() should round up with timer granularity - * so that we never get into inf recursion here, - * it doesn't do that though - */ - __run_hrtimer(timer); + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); return 1; } return 0; @@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. */ -static void enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, int reprogram) +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (leftmost) { - /* - * Reprogram the clock event device. When the timer is already - * expired hrtimer_enqueue_reprogram has either called the - * callback or added it to the pending list and raised the - * softirq. - * - * This is a NOP for !HIGHRES - */ - if (reprogram && hrtimer_enqueue_reprogram(timer, base)) - return; - + if (leftmost) base->first = &timer->node; - } rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); @@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; + + return leftmost; } /* @@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, leftmost; base = lock_hrtimer_base(timer, &flags); @@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n timer_stats_hrtimer_set_start_info(timer); + leftmost = enqueue_hrtimer(timer, new_base); + /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? */ - enqueue_hrtimer(timer, new_base, - new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) + hrtimer_enqueue_reprogram(timer, new_base); unlock_hrtimer_base(timer, &flags); @@ -1157,13 +1143,13 @@ static void __run_hrtimer(struct hrtimer *timer) spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid - * reprogramming of the event hardware. This happens at the end of this - * function anyway. + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogramm the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); + enqueue_hrtimer(timer, base); } timer->state &= ~HRTIMER_STATE_CALLBACK; } @@ -1243,6 +1229,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) } } +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + /** * hrtimer_peek_ahead_timers -- run soft-expired timers now * @@ -1254,20 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - struct tick_device *td; unsigned long flags; - if (!hrtimer_hres_active()) - return; - local_irq_save(flags); - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); + __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } -#endif /* CONFIG_HIGH_RES_TIMERS */ +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from timer softirq every jiffy, expire hrtimers: @@ -1513,39 +1518,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timers on the new cpu, but do not reprogram - * the timer as that would enable a deadlock between - * hrtimer_enqueue_reprogramm() running the timer and us still - * holding a nested base lock. - * - * Instead we tickle the hrtimer interrupt after the migration - * is done, which will run all expired timers and re-programm - * the timer device. + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. */ - enqueue_hrtimer(timer, new_base, 0); + enqueue_hrtimer(timer, new_base); /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; } } -static int migrate_hrtimers(int scpu) +static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int dcpu, i; + int i; BUG_ON(cpu_online(scpu)); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &get_cpu_var(hrtimer_bases); - - dcpu = smp_processor_id(); - tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1554,15 +1556,11 @@ static int migrate_hrtimers(int scpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(hrtimer_bases); + spin_unlock(&new_base->lock); - return dcpu; -} - -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1583,11 +1581,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DEAD_FROZEN: { - int dcpu; - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - dcpu = migrate_hrtimers(scpu); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + migrate_hrtimers(scpu); break; } #endif @@ -1608,6 +1603,9 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8ce..1de9700f416e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/async.h> #include "internals.h" @@ -34,6 +35,10 @@ unsigned long probe_irq_on(void) unsigned int status; int i; + /* + * quiesce the kernel, or at least the asynchronous portion + */ + async_synchronize_full(); mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb908669..a27a5f64443d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code on failure. Note that a diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f25259a..1b9cbdc0127a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobe_enabled; -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -115,6 +115,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -144,10 +145,10 @@ loop_end: } /** - * get_insn_slot() - Find a slot on an executable page for an instruction. + * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) return kip->insns; } +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret; + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(); + mutex_unlock(&kprobe_insn_mutex); + return ret; +} + /* Return 1 if all garbages are collected, otherwise 0. */ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { @@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; + int safety; /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) + mutex_unlock(&kprobe_insn_mutex); + safety = check_safety(); + mutex_lock(&kprobe_insn_mutex); + if (safety != 0) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; + mutex_lock(&kprobe_insn_mutex); hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { @@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); + + mutex_unlock(&kprobe_insn_mutex); } #endif @@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { + if (kp->pre_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { + if (kp->post_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, head); } -void kretprobe_hash_lock(struct task_struct *tsk, +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) spin_unlock_irqrestore(hlist_lock, *flags); } -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; - if (p->post_handler) + /* We don't care the kprobe which has gone. */ + if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); @@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap; + if (kprobe_gone(old_p)) { + /* + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. + */ + ret = arch_prepare_kprobe(old_p); + if (ret) + return ret; + } if (old_p->pre_handler == aggr_pre_handler) { copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); + ap = old_p; } else { ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) + if (!ap) { + if (kprobe_gone(old_p)) + arch_remove_kprobe(old_p); return -ENOMEM; + } add_aggr_kprobe(ap, old_p); copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); } + if (kprobe_gone(old_p)) { + /* + * If the old_p has gone, its breakpoint has been disarmed. + * We have to arm it again after preparing real kprobes. + */ + ap->flags &= ~KPROBE_FLAG_GONE; + if (kprobe_enabled) + arch_arm_kprobe(ap); + } return ret; } @@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *old_p; @@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; } - p->mod_refcounted = 0; - + p->flags = 0; /* * Check if are we probing a module. */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod; - calling_mod = __module_text_address(called_from); /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. */ - if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } - p->mod_refcounted = 1; - } else - probed_mod = NULL; + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); + return -EINVAL; + } + /* + * If the module freed .init.text, we couldn't insert + * kprobes in there. + */ + if (within_module_init((unsigned long)p->addr, probed_mod) && + probed_mod->state != MODULE_STATE_COMING) { + module_put(probed_mod); + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, out: mutex_unlock(&kprobe_mutex); - if (ret && probed_mod) + if (probed_mod) module_put(probed_mod); + return ret; } @@ -697,16 +743,16 @@ valid_p: list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. */ - if (kprobe_enabled) + if (kprobe_enabled && !kprobe_gone(old_p)) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) old_p->break_handler = NULL; - if (p->post_handler) { + if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &old_p->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; @@ -721,39 +767,27 @@ noclean: static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct module *mod; struct kprobe *old_p; - if (p->mod_refcounted) { - /* - * Since we've already incremented refcount, - * we don't need to disable preemption. - */ - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } + if (list_empty(&p->list)) arch_remove_kprobe(p); + else if (list_is_singular(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); + arch_remove_kprobe(old_p); + kfree(old_p); } } -static int __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) +int __kprobes register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); + ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); @@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num, return ret; } -/* - * Registration and unregistration functions for kprobe. - */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) +int __kprobes register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num, /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); + ret = register_kprobe(&jp->kp); } if (ret < 0) { if (i > 0) @@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num, int __kprobes register_jprobe(struct jprobe *jp) { - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); + return register_jprobes(&jp, 1); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_jprobes(&jp, 1); } -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->nmissed = 0; /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); + ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } -static int __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); + ret = register_kretprobe(rps[i]); if (ret < 0) { if (i > 0) unregister_kretprobes(rps, i); @@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num, return ret; } -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { int i; @@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ +/* Set the kprobe gone and remove its instruction buffer. */ +static void __kprobes kill_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; + if (p->pre_handler == aggr_pre_handler) { + /* + * If this is an aggr_kprobe, we have to list all the + * chained probes and mark them GONE. + */ + list_for_each_entry_rcu(kp, &p->list, list) + kp->flags |= KPROBE_FLAG_GONE; + p->post_handler = NULL; + p->break_handler = NULL; + } + /* + * Here, we can remove insn_slot safely, because no thread calls + * the original probed function (which will be freed soon) any more. + */ + arch_remove_kprobe(p); +} + +/* Module notifier call back, checking kprobes on the module */ +static int __kprobes kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + int checkcore = (val == MODULE_STATE_GOING); + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) + return NOTIFY_DONE; + + /* + * When MODULE_STATE_GOING was notified, both of module .text and + * .init.text sections would be freed. When MODULE_STATE_LIVE was + * notified, only .init.text section would be freed. We need to + * disable kprobes which have been inserted in the sections. + */ + mutex_lock(&kprobe_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (within_module_init((unsigned long)p->addr, mod) || + (checkcore && + within_module_core((unsigned long)p->addr, mod))) { + /* + * The vaddr this probe is installed will soon + * be vfreed buy not synced to disk. Hence, + * disarming the breakpoint isn't needed. + */ + kill_kprobe(p); + } + } + mutex_unlock(&kprobe_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block kprobe_module_nb = { + .notifier_call = kprobes_module_callback, + .priority = 0 +}; + static int __init init_kprobes(void) { int i, err = 0; @@ -1111,6 +1174,9 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_module_notifier(&kprobe_module_nb); + kprobes_initialized = (err == 0); if (!err) @@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); + seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, + sym, offset, (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : "")); else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); + seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); + if (!kprobe_gone(p)) + arch_arm_kprobe(p); } kprobe_enabled = true; @@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) arch_disarm_kprobe(p); } } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 08dd8ed86c77..528dd78e7e7e 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -137,7 +137,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif diff --git a/kernel/module.c b/kernel/module.c index dd2a54155b54..c9332c90d5a0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> -#include <linux/unwind.h> #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -51,6 +50,7 @@ #include <asm/sections.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> +#include <linux/async.h> #if 0 #define DEBUGP printk @@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags) return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; + /* Create stop_machine threads since free_module relies on + * a non-failing stop_machine call. */ + ret = stop_machine_create(); + if (ret) + return ret; + + if (mutex_lock_interruptible(&module_mutex) != 0) { + ret = -EINTR; + goto out_stop; + } mod = find_module(name); if (!mod) { @@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + async_synchronize_full(); mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); @@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags) out: mutex_unlock(&module_mutex); +out_stop: + stop_machine_destroy(); return ret; } -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { struct module_use *use; int printed_something = 0; @@ -893,7 +904,7 @@ void module_put(struct module *module) EXPORT_SYMBOL(module_put); #else /* !CONFIG_MODULE_UNLOAD */ -static void print_unload_info(struct seq_file *m, struct module *mod) +static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. */ seq_printf(m, " - -"); @@ -1439,8 +1450,6 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); - unwind_remove_table(mod->unwind_info, 0); - /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +/* Additional bytes needed by arch in front of individual sections */ +unsigned int __weak arch_mod_section_prepend(struct module *mod, + unsigned int section) +{ + /* default implementation just returns zero */ + return 0; +} + /* Update size with this section: return offset. */ -static long get_offset(unsigned int *size, Elf_Shdr *sechdr) +static long get_offset(struct module *mod, unsigned int *size, + Elf_Shdr *sechdr, unsigned int section) { long ret; + *size += arch_mod_section_prepend(mod, section); ret = ALIGN(*size, sechdr->sh_addralign ?: 1); *size = ret + sechdr->sh_size; return ret; @@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) == 0) continue; - s->sh_entsize = get_offset(&mod->core_size, s); + s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", secstrings + s->sh_name); } if (m == 0) @@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod, || strncmp(secstrings + s->sh_name, ".init", 5) != 0) continue; - s->sh_entsize = (get_offset(&mod->init_size, s) + s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", secstrings + s->sh_name); } @@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } -static int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, unsigned long value, + const struct module *mod) { - if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) - return 1; + const struct kernel_symbol *ks; + if (!mod) + ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else - if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) - return 1; - else - return 0; + ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && ks->value == value; } /* As per nm */ @@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int unwindex = 0; unsigned int num_kp, num_mcount; struct kernel_param *kp; struct module *mod; @@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod, /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return ERR_PTR(-ENOMEM); + + /* Create stop_machine threads since the error path relies on + * a non-failing stop_machine call. */ + err = stop_machine_create(); + if (err) + goto free_hdr; + if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); -#endif /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2240,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod, add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Size of section 0 is 0, so this works well if no unwind info. */ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - /* Get rid of temporary copy */ vfree(hdr); + stop_machine_destroy(); /* Done! */ return mod; @@ -2270,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); + stop_machine_destroy(); return ERR_PTR(err); truncated: @@ -2337,11 +2354,12 @@ sys_init_module(void __user *umod, /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); - unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2376,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod, unsigned long nextval; /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) + if (within_module_init(addr, mod)) nextval = (unsigned long)mod->module_init+mod->init_text_size; else nextval = (unsigned long)mod->module_core+mod->core_text_size; @@ -2424,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -2447,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -2471,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -2504,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); - *exported = is_exported(name, mod); + *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } @@ -2691,7 +2709,7 @@ int is_module_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_core, mod->core_size)) { + if (within_module_core(addr, mod)) { preempt_enable(); return 1; } diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 43c2111cd54d..78bc3fdac0d2 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -13,7 +13,6 @@ struct ns_cgroup { struct cgroup_subsys_state css; - spinlock_t lock; }; struct cgroup_subsys ns_subsys; @@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) return ERR_PTR(-ENOMEM); - spin_lock_init(&ns_cgroup->lock); return &ns_cgroup->css; } diff --git a/kernel/panic.c b/kernel/panic.c index 13f06349a786..2a2ff36ff44d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,8 @@ static int init_oops_id(void) { if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); + else + oops_id++; return 0; } diff --git a/kernel/pid.c b/kernel/pid.c index 064e76afa507..1b3586fe753a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_session_nr_ns); +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + /* - * Used by proc to find the first pid that is greater then or equal to nr. + * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f77d3819ef57..45e8541ab7e3 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode) { int error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); if (error) goto Close; diff --git a/kernel/power/main.c b/kernel/power/main.c index 613f16941b85..239988873971 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) /* this may fail if the RTC hasn't been initialized */ status = rtc_read_time(rtc, &alm.time); if (status < 0) { - printk(err_readtime, rtc->dev.bus_id, status); + printk(err_readtime, dev_name(&rtc->dev), status); return; } rtc_tm_to_time(&alm.time, &now); @@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) status = rtc_set_alarm(rtc, &alm); if (status < 0) { - printk(err_wakealarm, rtc->dev.bus_id, status); + printk(err_wakealarm, dev_name(&rtc->dev), status); return; } @@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(char **)name_ptr = dev->bus_id; + *(const char **)name_ptr = dev_name(dev); return 1; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab836e998..f5fc2d7680f2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. * @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) - return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; - zbm->next = zbmlist; - zbmlist = zbm; + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -903,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -944,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? - saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { @@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { @@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, ©_bm); if (error) @@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d89..a92c91451559 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/printk.c b/kernel/printk.c index e651ab05655f..7015733793e8 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) static const char recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int recursion_bug; - static int new_text_line = 1; +static int new_text_line = 1; static char printk_buf[1024]; asmlinkage int vprintk(const char *fmt, va_list args) diff --git a/kernel/profile.c b/kernel/profile.c index d18e2d2654f2..784933acf5b8 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -445,7 +445,6 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <asm/uaccess.h> -#include <asm/ptrace.h> static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ad63af8b2521..d92a76a881aa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void); /* Makes kernel-doc tools happy */ -synchronize_rcu_xxx(synchronize_rcu, call_rcu) +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index f9dc8f3720f6..33cfc50781f9 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +void __synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(__synchronize_sched); /* diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3245b40952c6..1cff28db56b6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,7 +136,7 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ +#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ #define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ static int fullstop; /* stop generating callbacks at test end. */ DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ @@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, { if (fullstop) return NOTIFY_DONE; - if (signal_pending(current)) { - mutex_lock(&fullstop_mutex); - if (!ACCESS_ONCE(fullstop)) - fullstop = FULLSTOP_SIGNALED; - mutex_unlock(&fullstop_mutex); - } + mutex_lock(&fullstop_mutex); + if (!fullstop) + fullstop = FULLSTOP_SHUTDOWN; + mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } @@ -624,7 +622,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -759,7 +757,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a342b032112c..f2d8638e6c60 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); #ifdef CONFIG_NO_HZ -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, + .dynticks = 1, +}; #endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per softirq. */ @@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rnp->qsmask = rnp->qsmaskinit; + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { -#ifdef CONFIG_NO_HZ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dynticks_nesting = 1; - rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ - rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; -#endif /* #ifdef CONFIG_NO_HZ */ rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index f275c8eca772..bf8e7534c803 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -15,10 +15,11 @@ #include <linux/uaccess.h> #include <linux/mm.h> -void res_counter_init(struct res_counter *counter) +void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = (unsigned long long)LLONG_MAX; + counter->parent = parent; } int res_counter_charge_locked(struct res_counter *counter, unsigned long val) @@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) return 0; } -int res_counter_charge(struct res_counter *counter, unsigned long val) +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) { int ret; unsigned long flags; - - spin_lock_irqsave(&counter->lock, flags); - ret = res_counter_charge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); return ret; } @@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) void res_counter_uncharge(struct res_counter *counter, unsigned long val) { unsigned long flags; + struct res_counter *c; - spin_lock_irqsave(&counter->lock, flags); - res_counter_uncharge_locked(counter, val); - spin_unlock_irqrestore(&counter->lock, flags); + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + res_counter_uncharge_locked(c, val); + spin_unlock(&c->lock); + } + local_irq_restore(flags); } diff --git a/kernel/resource.c b/kernel/resource.c index e633106b12f6..ca6a1536b205 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res) */ struct resource * __request_region(struct resource *parent, resource_size_t start, resource_size_t n, - const char *name) + const char *name, int flags) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent, res->start = start; res->end = start + n - 1; res->flags = IORESOURCE_BUSY; + res->flags |= flags; write_lock(&resource_lock); @@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start, { struct resource * res; - res = __request_region(parent, start, n, "check-region"); + res = __request_region(parent, start, n, "check-region", 0); if (!res) return -EBUSY; @@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev, dr->start = start; dr->n = n; - res = __request_region(parent, start, n, name); + res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else @@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) return err; } + +#ifdef CONFIG_STRICT_DEVMEM +static int strict_iomem_checks = 1; +#else +static int strict_iomem_checks; +#endif + +/* + * check if an address is reserved in the iomem resource tree + * returns 1 if reserved, 0 if not reserved. + */ +int iomem_is_exclusive(u64 addr) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + int size = PAGE_SIZE; + + if (!strict_iomem_checks) + return 0; + + addr = addr & PAGE_MASK; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + break; + if (p->end < addr) + continue; + if (p->flags & IORESOURCE_BUSY && + p->flags & IORESOURCE_EXCLUSIVE) { + err = 1; + break; + } + } + read_unlock(&resource_lock); + + return err; +} + +static int __init strict_iomem(char *str) +{ + if (strstr(str, "relaxed")) + strict_iomem_checks = 0; + if (strstr(str, "strict")) + strict_iomem_checks = 1; + return 1; +} + +__setup("iomem=", strict_iomem); diff --git a/kernel/sched.c b/kernel/sched.c index 545c6fccd1dc..deb5ac8c12f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3728,8 +3728,13 @@ redo: } double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -6957,7 +6962,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); @@ -6970,7 +6975,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) } if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto free_rd; + goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) @@ -6986,8 +6991,7 @@ free_online: free_cpumask_var(rd->online); free_span: free_cpumask_var(rd->span); -free_rd: - kfree(rd); +out: return -ENOMEM; } @@ -7987,7 +7991,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static void arch_reinit_sched_domains(void) { get_online_cpus(); @@ -7996,13 +8000,10 @@ int arch_reinit_sched_domains(void) rebuild_sched_domains(); put_online_cpus(); - - return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { - int ret; unsigned int level = 0; if (sscanf(buf, "%u", &level) != 1) @@ -8023,9 +8024,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - ret = arch_reinit_sched_domains(); + arch_reinit_sched_domains(); - return ret ? ret : count; + return count; } #ifdef CONFIG_SCHED_MC @@ -8060,7 +8061,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_store); #endif -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe3..a0b0852414cc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { + if (timekeeping_suspended) + return; + sched_clock_tick(); touch_softlockup_watchdog(); } diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 018b7be1db2e..1e00bfacf9b8 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { int i; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 56c0efe902a7..8e1352c75557 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= P[w / rw] - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - -/* * delta /= w */ static inline unsigned long @@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + + for_each_sched_entity(se) { + struct load_weight *load = &cfs_rq->load; - if (unlikely(!se->on_rq)) - nr_running++; + if (unlikely(!se->on_rq)) { + struct load_weight lw = cfs_rq->load; - return calc_delta_weight(__sched_period(nr_running), se); + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; } /* @@ -1623,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } } -#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) - /* * Share the fairness runtime between parent and child, thus the * total amount of pressure for CPU stays equal - new tasks diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855ff3cf..3152ac3b62e2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); + q->info.si_pid = task_tgid_nr_ns(current, + task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 286c41722e8c..0cd415ee62a2 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -38,7 +38,10 @@ struct stop_machine_data { static unsigned int num_threads; static atomic_t thread_ack; static DEFINE_MUTEX(lock); - +/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */ +static DEFINE_MUTEX(setup_lock); +/* Users of stop_machine. */ +static int refcount; static struct workqueue_struct *stop_machine_wq; static struct stop_machine_data active, idle; static const cpumask_t *active_cpus; @@ -109,6 +112,43 @@ static int chill(void *unused) return 0; } +int stop_machine_create(void) +{ + mutex_lock(&setup_lock); + if (refcount) + goto done; + stop_machine_wq = create_rt_workqueue("kstop"); + if (!stop_machine_wq) + goto err_out; + stop_machine_work = alloc_percpu(struct work_struct); + if (!stop_machine_work) + goto err_out; +done: + refcount++; + mutex_unlock(&setup_lock); + return 0; + +err_out: + if (stop_machine_wq) + destroy_workqueue(stop_machine_wq); + mutex_unlock(&setup_lock); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(stop_machine_create); + +void stop_machine_destroy(void) +{ + mutex_lock(&setup_lock); + refcount--; + if (refcount) + goto done; + destroy_workqueue(stop_machine_wq); + free_percpu(stop_machine_work); +done: + mutex_unlock(&setup_lock); +} +EXPORT_SYMBOL_GPL(stop_machine_destroy); + int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { struct work_struct *sm_work; @@ -146,19 +186,14 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { int ret; + ret = stop_machine_create(); + if (ret) + return ret; /* No CPUs can come up or down during this. */ get_online_cpus(); ret = __stop_machine(fn, data, cpus); put_online_cpus(); - + stop_machine_destroy(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); - -static int __init stop_machine_init(void) -{ - stop_machine_wq = create_rt_workqueue("kstop"); - stop_machine_work = alloc_percpu(struct work_struct); - return 0; -} -core_initcall(stop_machine_init); diff --git a/kernel/sys.c b/kernel/sys.c index d356d79e84ac..763c3c17ded3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/ptrace.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } @@ -1627,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); accumulate_thread_rusage(p, r); goto out; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c7626f..89d74436318c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,15 +82,14 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +100,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,12 +952,22 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, @@ -969,6 +979,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), @@ -1085,6 +1105,17 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#else + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, #endif { .ctl_name = VM_LAPTOP_MODE, diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395b45b2..4f104515a19b 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,21 +22,11 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); static noinline u32 kprobe_target(u32 value) { - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - return (value / div_factor); } @@ -74,7 +64,7 @@ static int test_kprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kprobe(&kp); if (preh_val == 0) { @@ -92,6 +82,84 @@ static int test_kprobe(void) return 0; } +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + static u32 j_kprobe_target(u32 value) { if (value != rand1) { @@ -121,7 +189,7 @@ static int test_jprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -132,6 +200,43 @@ static int test_jprobe(void) return 0; } +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe *jps[2] = {&jp, &jp2}; + + jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -177,7 +282,7 @@ static int test_kretprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -187,12 +292,72 @@ static int test_kretprobe(void) return 0; } + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} #endif /* CONFIG_KRETPROBES */ int init_test_probes(void) { int ret; + target = kprobe_target; + target2 = kprobe_target2; + do { rand1 = random32(); } while (rand1 <= div_factor); @@ -204,15 +369,30 @@ int init_test_probes(void) errors++; num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + + num_tests++; ret = test_jprobe(); if (ret < 0) errors++; + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); if (ret < 0) errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; #endif /* CONFIG_KRETPROBES */ if (errors) diff --git a/kernel/time.c b/kernel/time.c index d63a4336fad6..4886e3ce83a4 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -37,6 +37,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ptrace.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1ca99557e929..06f197560f3b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -45,7 +45,7 @@ * * The value 8 is somewhat carefully chosen, as anything * larger can result in overflows. NSEC_PER_JIFFY grows as - * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ #define JIFFIES_SHIFT 8 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fa05e88aa76f..900f1b6598d1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + static struct timespec xtime_cache __attribute__ ((aligned (16))); void update_xtime_cache(u64 nsec) { @@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts) unsigned long seq; s64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqbegin(&xtime_lock); @@ -299,8 +304,6 @@ void __init timekeeping_init(void) write_sequnlock_irqrestore(&xtime_lock, flags); } -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9d9760dc7b6..8b0daf0662ef 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab35716..43f891b05a4b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; |