From 0ce8974d504913a0f0ae2d97b20a5ac665431a41 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 03:13:27 -0800 Subject: sysctl: Consolidate !CONFIG_SYSCTL handling - In sysctl.h move functions only available if CONFIG_SYSCTL is defined inside of #ifdef CONFIG_SYSCTL - Move the stub function definitions for !CONFIG_SYSCTL into sysctl.h and make them static inlines. Signed-off-by: Eric W. Biederman --- kernel/sysctl.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..d5bbddd0de24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2017,32 +2017,6 @@ void setup_sysctl_set(struct ctl_table_set *p, p->is_seen = is_seen; } -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ /* -- cgit v1.2.3 From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 03:34:20 -0800 Subject: sysctl: Register the base sysctl table like any other sysctl table. Simplify the code by treating the base sysctl table like any other sysctl table and register it with register_sysctl_table. To ensure this table is registered early enough to avoid problems call sysctl_init from proc_sys_init. Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid name conflicts now that kernel/sysctl.c:sysctl_init() is no longer static. Signed-off-by: Eric W. Biederman --- kernel/sysctl.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5bbddd0de24..ad460248acc7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,7 +192,7 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[]; +static struct ctl_table root_table[1]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, .ctl_table = root_table, .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), }; @@ -222,7 +222,7 @@ int sysctl_legacy_va_layout; /* The default sysctl tables: */ -static struct ctl_table root_table[] = { +static struct ctl_table sysctl_base_table[] = { { .procname = "kernel", .mode = 0555, @@ -1747,17 +1747,12 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) } } -static __init int sysctl_init(void) +int __init sysctl_init(void) { - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif + register_sysctl_table(sysctl_base_table); return 0; } -core_initcall(sysctl_init); - static struct ctl_table *is_branch_in(struct ctl_table *branch, struct ctl_table *table) { -- cgit v1.2.3 From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 04:07:15 -0800 Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c into fs/proc/proc_sysctl.c.
Currently sysctl maintenance is hampered by the sysctl implementation being split across 3 files with artificial layering between them. Consolidate the entire sysctl implementation into 1 file so that it is easier to see what is going on and hopefully allowing for simpler maintenance. For functions that are now only used in fs/proc/proc_sysctl.c remove their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c Signed-off-by: Eric W. Biederman --- kernel/Makefile | 1 - kernel/sysctl.c | 464 -------------------------------------------------- kernel/sysctl_check.c | 160 ----------------- 3 files changed, 625 deletions(-) delete mode 100644 kernel/sysctl_check.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad460248acc7..b774909ed46c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,20 +192,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[1]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -1559,459 +1545,12 @@ static struct ctl_table dev_table[] = { { } }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. 
- */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. 
- */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - int __init sysctl_init(void) { register_sysctl_table(sysctl_base_table); return 0; } -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. - * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. 
- * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. 
A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - #endif /* CONFIG_SYSCTL */ /* @@ -2977,6 +2516,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char 
**fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -- cgit v1.2.3
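To make the registration interface documented above concrete, a minimal caller could look like the sketch below; the knob name and variable are hypothetical and appear nowhere in this series:

#include <linux/module.h>
#include <linux/sysctl.h>

static int example_value;	/* hypothetical: /proc/sys/kernel/example_knob */

static struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* completely zero-filled entry terminates the table */
};

static const struct ctl_path example_path[] = {
	{ .procname = "kernel" },
	{ }
};

static struct ctl_table_header *example_header;

static int __init example_init(void)
{
	example_header = register_sysctl_paths(example_path, example_table);
	if (!example_header)	/* registration returns NULL on failure */
		return -ENOMEM;
	return 0;
}

static void __exit example_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_init);
module_exit(example_exit);

Per the kernel-doc above, the zero-filled sentinel and the NULL return on failure are part of the contract, and unregister_sysctl_table() tolerates a NULL header.

From 2ed86b16eabe4efbf80cc725a8cbb5310746a2fc Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 25 Jan 2012 20:02:40 -0600 Subject: irq: make SPARSE_IRQ an optionally hidden option On ARM, we don't want SPARSE_IRQ to be a user visible option. Make SPARSE_IRQ visible based on MAY_HAVE_SPARSE_IRQ instead of depending on HAVE_SPARSE_IRQ. With this, SPARSE_IRQ is not visible on C6X and ARM. Signed-off-by: Rob Herring Cc: Russell King Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.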
Peter Anvin" Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-c6x-dev@linux-c6x.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org --- kernel/irq/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..1f2dece9ad4c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,7 +13,7 @@ config GENERIC_HARDIRQS # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -config HAVE_SPARSE_IRQ +config MAY_HAVE_SPARSE_IRQ bool # Enable the generic irq autoprobe mechanism @@ -61,8 +61,7 @@ config IRQ_FORCED_THREADING bool config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ ---help--- Sparse irq numbering is useful for distro kernels that want -- cgit v1.2.3 From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..4db020015f14 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,7 +473,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); -- cgit v1.2.3 From 6684ba202b5ab2f36d574c72fe50c207d99b3e35 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 17:38:00 -0800 Subject: compat: Add helper functions to read/write struct timeval, timespec Add helper functions to read and write struct timeval and struct timespec from userspace. We already had helper functions for reading and writing struct compat_timespec; add a set of functions to do the same with struct timeval, and add a second suite of functions which can be sensitive to COMPAT_USE_64BIT_TIME and access either 32- or 64-bit time structures. This also exports these helper functions to modules. Rename the existing inlines for converting between struct compat_timeval and native struct timespec so we can have a saner naming convention for the exported functions. Suggested-by: Linus Torvalds Signed-off-by: H. 
Peter Anvin --- kernel/compat.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index f346cedfe24d..74ff8498809a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -31,11 +31,10 @@ #include /* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. + * Get/set struct timeval with struct timespec on the native side */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) +static int compat_get_timeval_convert(struct timespec *o, + struct compat_timeval __user *i) { long usec; @@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, return 0; } -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) +static int compat_put_timeval_convert(struct compat_timeval __user *o, + struct timeval *i) { return (put_user(i->tv_sec, &o->tv_sec) || put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; @@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) + if (compat_put_timeval_convert(tv, &ktv)) return -EFAULT; } if (tz) { @@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone ktz; if (tv) { - if (compat_get_timeval(&kts, tv)) + if (compat_get_timeval_convert(&kts, tv)) return -EFAULT; } if (tz) { @@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + __get_user(tv->tv_sec, &ctv->tv_sec) || + __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(get_compat_timeval); + +int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + __put_user(tv->tv_sec, &ctv->tv_sec) || + __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_compat_timeval); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(get_compat_timespec); int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { @@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user } EXPORT_SYMBOL_GPL(put_compat_timespec); +int compat_get_timeval(struct timeval *tv, const void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + else + return get_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_get_timeval); + +int compat_put_timeval(const struct timeval *tv, void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + else + return put_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_put_timeval); + +int compat_get_timespec(struct timespec *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof *ts) ? 
-EFAULT : 0; + else + return get_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec); + +int compat_put_timespec(const struct timespec *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + else + return put_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec); + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; -- cgit v1.2.3 From 499e547057f5bba5cd6f87ebe59b05d0c59da905 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Feb 2012 15:50:28 -0500 Subject: tracing/ring-buffer: Only have tracing_on disable tracing buffers As the ring-buffer code is being used by other facilities in the kernel, having the tracing_on file disable *all* buffers is not a desired effect. It should only disable the ftrace buffers that are being used. Move the code into the trace.c file and use the buffer disabling for tracing_on() and tracing_off(). This way only the ftrace buffers will be affected by them and other kernel utilities will not be confused as to why their output suddenly stopped. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 157 +++++++++++++++++---------------------------- kernel/trace/trace.c | 107 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + 3 files changed, 168 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5b7b5c1195b..cf8d11e91efd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,33 +154,10 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. - */ -void tracing_on(void) -{ - set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ - clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) /** * tracing_off_permanent - permanently disable ring buffers @@ -193,15 +170,6 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ - return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -2618,6 +2586,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); +/** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. 
+ * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, whereas the disable() version + * must be paired with an enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd | RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, whereas the enable() version + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd & ~RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. @@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) - r = sprintf(buf, "permanently disabled\n"); - else - r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val) - set_bit(RB_BUFFERS_ON_BIT, p); - else - clear_bit(RB_BUFFERS_ON_BIT, p); - - (*ppos)++; - - return cnt; -} - -static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, - .read = rb_simple_read, - .write = rb_simple_write, - .llseek = default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - - return 0; -} - -fs_initcall(rb_init_debugfs); -#endif - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 10d5503f0d04..f3c13d63d064 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -351,6 +351,59 @@ static void wakeup_work_handler(struct work_struct *work) static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); +/** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. 
+ */ +void tracing_on(void) +{ + if (global_trace.buffer) + ring_buffer_record_on(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * on the buffer. + */ + global_trace.buffer_disabled = 0; +} +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + if (global_trace.buffer) + ring_buffer_record_off(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * off the buffer. + */ + global_trace.buffer_disabled = 1; +} +EXPORT_SYMBOL_GPL(tracing_off); + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + if (global_trace.buffer) + return ring_buffer_record_is_on(global_trace.buffer); + return !global_trace.buffer_disabled; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + /** * trace_wake_up - wake up tasks waiting for trace input * @@ -4567,6 +4620,55 @@ static __init void create_trace_options_dir(void) create_trace_option_core_file(trace_options[i], i); } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + char buf[64]; + int r; + + if (buffer) + r = ring_buffer_record_is_on(buffer); + else + r = 0; + + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (buffer) { + if (val) + ring_buffer_record_on(buffer); + else + ring_buffer_record_off(buffer); + } + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, + .llseek = default_llseek, +}; + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -4626,6 +4728,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_clock", 0644, d_tracer, NULL, &trace_clock_fops); + trace_create_file("tracing_on", 0644, d_tracer, + global_trace.buffer, &rb_simple_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -4863,6 +4968,8 @@ __init static int tracer_alloc_buffers(void) goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + if (global_trace.buffer_disabled) + tracing_off(); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54faec790bc1..ce887c0eca56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -154,6 +154,7 @@ struct trace_array { struct ring_buffer *buffer; unsigned long entries; int cpu; + int buffer_disabled; cycle_t time_start; struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; -- cgit v1.2.3
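Outside ftrace, a ring-buffer user can drive the new per-buffer switch directly; a minimal sketch (my_buffer is a hypothetical buffer owned by the caller, not something in this patch):

	/* pause: an on/off switch, not a nesting counter, so calls
	 * need not be paired the way record_disable()/record_enable()
	 * must be */
	ring_buffer_record_off(my_buffer);

	/* ... quiescent section: writes now fail and return NULL ... */

	if (!ring_buffer_record_is_on(my_buffer))
		ring_buffer_record_on(my_buffer);	/* resume */

From 8c9cf542b8a66c231747a550573d910daf17f0e9 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon,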
27 Feb 2012 09:08:21 +0100 Subject: tracing: Do not select FRAME_POINTER on PPC On PowerPC, FUNCTION_TRACER selects FRAME_POINTER, even though the architecture does not support it. This causes the following warning: warning: (LOCKDEP && FAULT_INJECTION_STACKTRACE_FILTER && LATENCYTOP && FUNCTION_TRACER && KMEMCHECK) selects FRAME_POINTER which has unmet direct dependencies (DEBUG_KERNEL && (CRIS || M68K || FRV || UML || AVR32 || SUPERH || BLACKFIN || MN10300) || ARCH_WANT_FRAME_POINTERS) So remove the warning by adding the extra condition "if !PPC" to FUNCTION_TRACER for FRAME_POINTER selection Link: http://lkml.kernel.org/r/1330330101-8618-1-git-send-email-gerlando.falauto@keymile.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Gerlando Falauto Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cd3134510f3d..a1d2849f2473 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE + select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER -- cgit v1.2.3 From 13ae246db4a02971ef4f557af1f6d3e21d64b710 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 29 Jan 2012 15:44:45 -0500 Subject: includecheck: delete any duplicate instances of module.h Different tree maintainers picked up independently generated trivial compile fixes based on linux-next testing, resulting in some cases where a file would have got more than one addition of module.h once everything was all merged together. Delete any duplicates so includecheck isn't complaining about anything related to module.h/export.h changes. Signed-off-by: Paul Gortmaker --- kernel/params.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..47f5bf12434a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include -- cgit v1.2.3 From b892e5c89787716b95a8e55d77d25a1c0748df10 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Mar 2012 22:06:48 -0500 Subject: tracing: Keep NMI watchdog from triggering when dumping trace As ftrace_dump() (called by ftrace_dump_on_oops) disables interrupts as it dumps its output to the console, it can keep interrupts disabled for long periods of time. This is likely to trigger the NMI watchdog, and it can disrupt the output of critical data. Add a touch_nmi_watchdog() to each event that is written to the screen to keep the NMI watchdog from affecting the output. 
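The hunk below shows only the two changed lines; condensed as a sketch (details trimmed), the surrounding loop in __ftrace_dump() ends up shaped like this, petting the watchdog once per printed event:

	while (!trace_empty(&iter)) {
		if (trace_find_next_entry_inc(&iter) != NULL) {
			int ret = print_trace_line(&iter);
			if (ret != TRACE_TYPE_NO_CONSUME)
				trace_consume(&iter);
		}
		/* console writes run with IRQs off; pet the NMI
		 * watchdog after every event so it cannot fire in
		 * the middle of the dump */
		touch_nmi_watchdog();
		trace_printk_seq(&iter.seq);
	}
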
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3c13d63d064..3a19c354edd6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "trace.h" @@ -4903,6 +4904,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } + touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } -- cgit v1.2.3 From 01f23e1630d944f7085cd8fd5793e31ea91c03d8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 27 Nov 2011 21:43:10 +0000 Subject: sched/arch: Introduce the finish_arch_post_lock_switch() scheduler callback This callback is called by the scheduler after rq->lock has been released and interrupts enabled. It will be used in subsequent patches on the ARM architecture. Signed-off-by: Catalin Marinas Reviewed-by: Will Deacon Reviewed-by: Frank Rowand Tested-by: Will Deacon Tested-by: Marc Zyngier Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/20120313110840.7b444deb6b1bb902c15f3cdf@canb.auug.org.au Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57879e6..423f40f32a59 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1932,6 +1932,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..d72483d07c9f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -692,6 +692,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -- cgit v1.2.3 From db6544e0075d192e5ad16eda8689c55fa9c6f8f4 Mon Sep 17 00:00:00 2001 From: Rajesh Bhagat Date: Fri, 17 Feb 2012 13:59:15 +0530 Subject: ftrace: Fix function_graph for archs that test ftrace_trace_function When CONFIG_DYNAMIC_FTRACE is not set, some archs (ARM) test the variable ftrace_trace_function to determine if it should call the function tracer. If it is not set to ftrace_stub, then it will call the function and return, and not call the function graph tracer. But some of these archs (ARM) do not have the assembly code to test if function tracing is enabled or not (quick stop of tracing) and it calls the helper routine ftrace_test_stop_func() instead. If function tracer is enabled and then disabled, the variable ftrace_trace_function is still set to the helper routine ftrace_test_stop_func(), and not to ftrace_stub. This will prevent the function graph tracer from ever running. Output before patch /debug/tracing # echo function > current_tracer /debug/tracing # echo function_graph > current_tracer /debug/tracing # cat trace Output after patch /debug/tracing # echo function > current_tracer /debug/tracing # echo function_graph > current_tracer /debug/tracing # cat trace 0) ! 
253.375 us | } /* irq_enter */ 0) | generic_handle_irq() { 0) | handle_fasteoi_irq() { 0) 9.208 us | _raw_spin_lock(); 0) | handle_irq_event() { 0) | handle_irq_event_percpu() { Signed-off-by: Rajesh Bhagat Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 867bd1dd2dd0..0fa92f677c92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,7 +249,8 @@ static void update_ftrace_function(void) #else __ftrace_trace_function = func; #endif - ftrace_trace_function = ftrace_test_stop_func; + ftrace_trace_function = + (func == ftrace_stub) ? func : ftrace_test_stop_func; #endif } -- cgit v1.2.3 From fa73dc9400516945bcbae8d98c23393bcefe1440 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 28 Feb 2012 11:02:46 +0000 Subject: tracing: Fix build breakage without CONFIG_PERF_EVENTS Today's -next fails to build for me: CC kernel/trace/trace_export.o In file included from kernel/trace/trace_export.c:197: kernel/trace/trace_entries.h:58: error: 'perf_ftrace_event_register' undeclared here (not in a function) make[2]: *** [kernel/trace/trace_export.o] Error 1 make[1]: *** [kernel/trace] Error 2 make: *** [kernel] Error 2 because as of ced390 (ftrace, perf: Add support to use function tracepoint in perf) perf_ftrace_event_register() is declared in trace.h only if CONFIG_PERF_EVENTS is enabled but I don't have that set. Ensure that we always have a definition of perf_ftrace_event_register() by making the definition unconditional. Link: http://lkml.kernel.org/r/1330426967-17067-1-git-send-email-broonie@opensource.wolfsonmicro.com Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Signed-off-by: Mark Brown Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ce887c0eca56..95059f091a24 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -836,13 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_FUNCTION_TRACER int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL #endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From f695cf94837de53864180400cbac42cfa370426f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 14 Mar 2012 16:38:15 -0700 Subject: time: Fix change_clocksource locking change_clocksource() fails to grab locks or call timekeeping_update(), which leaves a race window for time inconsistencies. This adds proper locking and a call to timekeeping_update() to fix this. 
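In sketch form, the repaired function takes timekeeper.lock around the whole swap, mirroring the two hunks in the diff below (the timekeeper_setup_internals() line is context from the surrounding file, elided between the hunks):

static int change_clocksource(void *data)
{
	struct clocksource *new = data, *old;
	unsigned long flags;

	write_seqlock_irqsave(&timekeeper.lock, flags);

	timekeeping_forward_now();
	if (!new->enable || new->enable(new) == 0) {
		old = timekeeper.clock;
		timekeeper_setup_internals(new);
		if (old->disable)
			old->disable(old);
	}
	/* propagate the new state while still holding the lock */
	timekeeping_update(true);

	write_sequnlock_irqrestore(&timekeeper.lock, flags);
	return 0;
}
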
CC: Andy Lutomirski CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 403c2a092830..b53da5ecbea2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -448,9 +448,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset); static int change_clocksource(void *data) { struct clocksource *new, *old; + unsigned long flags; new = (struct clocksource *) data; + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeping_forward_now(); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; @@ -458,6 +461,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } + timekeeping_update(true); + + write_sequnlock_irqrestore(&timekeeper.lock, flags); + return 0; } -- cgit v1.2.3 From a4ca1298d8a0472a45624fa5fb99f90f0f367187 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:03 +0000 Subject: time: Remove bogus comments There is no global irq lock which makes a syscall magically SMP safe. Remove the outdated comment concerning do_settimeofday() as well. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 73e416db0a1e..ba744cf80696 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { - /* SMP safe, global irq locking makes it work. */ sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) } } if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ return do_settimeofday(tv); - } return 0; } -- cgit v1.2.3 From 01de982abf8c9e10fc3089e10585cd2cc914bdab Mon Sep 17 00:00:00 2001 From: Wolfgang Mauerer Date: Thu, 22 Mar 2012 11:18:20 +0100 Subject: tracing: Fix ftrace stack trace entries 8 hex characters tell only half the tale for 64 bit CPUs, so use the appropriate length. 
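The fix relies on adjacent string literals merging at compile time, so the pointer width is chosen once per build; the same idiom in isolation (a minimal sketch, not taken from the patch, with ip a hypothetical unsigned long):

#ifndef CONFIG_64BIT
# define IP_FMT "%08lx"
#else
# define IP_FMT "%016lx"
#endif

	/* "caller: " IP_FMT "\n" merges into a single literal:
	 * "caller: %08lx\n" on 32-bit, "caller: %016lx\n" on 64-bit */
	printk(KERN_DEBUG "caller: " IP_FMT "\n", ip);
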
Link: http://lkml.kernel.org/r/1332411501-8059-2-git-send-email-wolfgang.mauerer@siemens.com Cc: stable@vger.kernel.org Signed-off-by: Wolfgang Mauerer Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 16 ++++++++++++---- kernel/trace/trace_export.c | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..205dcac89206 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -156,6 +156,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -165,8 +171,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]) @@ -181,8 +188,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..ad4000c71be0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -150,7 +150,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define __dynamic_array(type, item) #undef F_printk -#define F_printk(fmt, args...) #fmt ", " __stringify(args) +#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ -- cgit v1.2.3 From 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 15 Mar 2012 13:04:03 -0700 Subject: ntp: Fix leap-second hrtimer livelock Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp subsystem has used an hrtimer for triggering the leapsecond adjustment. However, this can cause a potential livelock. Thomas diagnosed this as the following pattern: CPU 0 CPU 1 do_adjtimex() spin_lock_irq(&ntp_lock); process_adjtimex_modes(); timer_interrupt() process_adj_status(); do_timer() ntp_start_leap_timer(); write_lock(&xtime_lock); hrtimer_start(); update_wall_time(); hrtimer_reprogram(); ntp_tick_length() tick_program_event() spin_lock(&ntp_lock); clockevents_program_event() ktime_get() seq = req_seqbegin(xtime_lock); This patch tries to avoid the problem by reverting back to not using an hrtimer to inject leapseconds, and instead we handle the leapsecond processing in the second_overflow() function. 
The downside to this change is that on systems that support highres timers, the leap second processing will occur on a HZ tick boundary (ie: ~1-10ms, depending on HZ) after the leap second instead of possibly sooner (~34us in my tests w/ x86_64 lapic). This patch applies on top of tip/timers/core. CC: Sasha Levin CC: Thomas Gleixner Reported-by: Sasha Levin Diagnosed-by: Thomas Gleixner Tested-by: Sasha Levin Signed-off-by: John Stultz --- kernel/time/ntp.c | 128 +++++++++++++++------------------------- kernel/time/timekeeping.c | 20 +++----- 2 files changed, 47 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6e039b144daf..3d17ebd47fa2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -34,8 +34,6 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; -static struct hrtimer leap_timer; - #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -381,70 +379,63 @@ u64 ntp_tick_length(void) /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs) { - enum hrtimer_restart res = HRTIMER_NORESTART; - unsigned long flags; + s64 delta; int leap = 0; + unsigned long flags; spin_lock_irqsave(&ntp_lock, flags); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ switch (time_state) { case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; break; case TIME_INS: - leap = -1; - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; + if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } break; case TIME_DEL: - leap = 1; - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + if ((secs + 1) % 86400 == 0) { + leap = 1; + time_tai--; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } break; case TIME_OOP: time_tai++; time_state = TIME_WAIT; - /* fall through */ + break; + case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } - spin_unlock_irqrestore(&ntp_lock, flags); - - /* - * We have to call this outside of the ntp_lock to keep - * the proper locking hierarchy - */ - if (leap) - timekeeping_leap_insert(leap); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. 
- * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 delta; - unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -481,8 +472,13 @@ void second_overflow(void) tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; + + + out: spin_unlock_irqrestore(&ntp_lock, flags); + + return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -544,27 +540,6 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} /* * Propagate a new txc->status value into the NTP state: @@ -589,22 +564,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } } /* * Called with the xtime lock held, so we can access and modify @@ -686,9 +645,6 @@ int do_adjtimex(struct timex *txc) (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state != TIME_OK) - hrtimer_cancel(&leap_timer); } if (txc->modes & ADJ_SETOFFSET) { @@ -1010,6 +966,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b53da5ecbea2..5d76e09ddd3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp) } -void timekeeping_leap_insert(int leapsecond) -{ - unsigned long flags; - - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeper.xtime.tv_sec += leapsecond; - timekeeper.wall_to_monotonic.tv_sec -= leapsecond; - timekeeping_update(false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); - -} - /** * timekeeping_forward_now - update clock to the current time * @@ -969,9 +957,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { + int leap; timekeeper.xtime_nsec -= nsecps; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } /* Accumulate raw time */ @@ -1082,9 +1072,11 @@ static void update_wall_time(void) * xtime.tv_nsec isn't larger then NSEC_PER_SEC */ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + int leap; timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = 
second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } timekeeping_update(false); -- cgit v1.2.3 From c7206205d00ab375839bd6c7ddb247d600693c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Mar 2012 17:26:36 +0100 Subject: perf: Fix mmap_page capabilities and docs Complete the syscall-less self-profiling feature and address all complaints, namely: - capabilities, so we can detect what is actually available at runtime Add a capabilities field to perf_event_mmap_page to indicate what is actually available for use. - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending properly. - ABI documentation as to how all this stuff works. Also improve the documentation for the new features. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c61234b1a988..dc3b05272511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } -void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - perf_update_user_clock(userpg, now); + arch_perf_update_userpage(userpg, now); barrier(); ++userpg->lock; -- cgit v1.2.3 From 6c16a6dcb05e51ace340ff7bc6dbe647f1593528 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Mar 2012 13:07:16 -0700 Subject: sched: Fix compiler warning about declared inline after use kernel/sched/fair.c:420: warning: 'account_cfs_rq_runtime' declared inline after being called kernel/sched/fair.c:420: warning: previous declaration of 'account_cfs_rq_runtime' was here kernel/sched/fair.c:1165: warning: 'return_cfs_rq_runtime' declared inlineafter being called kernel/sched/fair.c:1165: warning: previous declaration of 'return_cfs_rq_runtime' was here Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20120321200717.49BB4A024E@akpm.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11f3979bad2a..258f430d71a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq 
*cfs_rq, struct sched_entity *se, int flags) @@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { -- cgit v1.2.3 From e335e3eb82dada2765297f6ba501afc7555aba10 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 22 Mar 2012 15:25:08 +0530 Subject: locking/kconfig: Simplify INLINE_SPIN_UNLOCK usage Get rid of INLINE_SPIN_UNLOCK entirely replacing it with UNINLINE_SPIN_UNLOCK instead of the reverse meaning. Whoever wants to change the default spinlock inlining behavior and uninline the spinlocks for some weird reason, such as spinlock debugging, paravirt etc. can now all just select UNINLINE_SPIN_UNLOCK Original discussion at: https://lkml.org/lkml/2012/3/21/357 Suggested-by: Linus Torvalds Signed-off-by: Raghavendra K T Cc: Linus Torvalds Cc: Ralf Baechle Cc: Chris Metcalf Cc: Chris Zankel Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20120322095502.30866.75756.sendpatchset@codeblue [ tidied up the changelog a bit ] Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 4 ++-- kernel/Kconfig.preempt | 1 + kernel/spinlock.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 5068e2a4e75f..2251882daf53 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ ARCH_INLINE_SPIN_LOCK_IRQSAVE -config INLINE_SPIN_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) +config UNINLINE_SPIN_UNLOCK + bool config INLINE_SPIN_UNLOCK_BH def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 24e7cb0ba26a..3f9c97419f02 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" select PREEMPT_COUNT + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 84c7d96918bf..5cdd8065a3ce 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_lock_bh); #endif -#ifndef CONFIG_INLINE_SPIN_UNLOCK +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); -- cgit v1.2.3 From ad30dfa94c5cc23931c822922a50bd163ab293a5 Mon Sep 17 00:00:00 2001 
From: John Stultz Date: Fri, 23 Mar 2012 15:52:25 -0700 Subject: alarmtimer: Make sure we initialize the rtctimer jonghwan Choi reported seeing warnings with the alarmtimer code at suspend/resume time, and pointed out that the rtctimer isn't being properly initialized. This patch corrects this issue. Reported-by: jonghwan Choi Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a46f5d64504..c16548807f1e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,9 +46,10 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct rtc_timer rtctimer; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ -static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); @@ -783,6 +784,8 @@ static int __init alarmtimer_init(void) .nsleep = alarm_timer_nsleep, }; + rtc_timer_init(&rtctimer, NULL, NULL); + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); -- cgit v1.2.3 From e919cfd42da54d400e7e0385f22cae3672dcf874 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 22 Mar 2012 19:14:46 -0700 Subject: time: Avoid scary backtraces when warning of > 11% adj Folks have been getting a number of warnings about time adjustments > 11%. The WARN_ON leaves a big useless backtrace so this patch removes it for a printk_once(). I'm still working to narrow down the cause of the > 11% adjustment. Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5d76e09ddd3d..16a175bed355 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -869,13 +869,15 @@ static void timekeeping_adjust(s64 offset) } else /* No adjustment needed */ return; - WARN_ONCE(timekeeper.clock->maxadj && - (timekeeper.mult + adj > timekeeper.clock->mult + - timekeeper.clock->maxadj), - "Adjusting %s more then 11%% (%ld vs %ld)\n", + if (unlikely(timekeeper.clock->maxadj && + (timekeeper.mult + adj > + timekeeper.clock->mult + timekeeper.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", timekeeper.clock->name, (long)timekeeper.mult + adj, (long)timekeeper.clock->mult + timekeeper.clock->maxadj); + } /* * So the following can be confusing. * -- cgit v1.2.3 From 335dd85895abeca1957d5eaa3013dfe8dc60c7d7 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sat, 11 Feb 2012 17:54:59 -0200 Subject: time: remove no_sync_cmos_clock Commit 9863c90f682fba34cdc26c3437e8c00da6c83fa4 (x86, vmware: Remove deprecated VMI kernel support) removed the only place which set no_sync_cmos_clock. Since that commit, this variable is never set. 
Signed-off-by: Cesar Eduardo Barros Signed-off-by: John Stultz --- kernel/time/ntp.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3d17ebd47fa2..f03fd83b170b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -483,9 +483,6 @@ out: #ifdef CONFIG_GENERIC_CMOS_UPDATE -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; - static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -532,8 +529,7 @@ static void sync_cmos_clock(struct work_struct *work) static void notify_cmos_timer(void) { - if (!no_sync_cmos_clock) - schedule_delayed_work(&sync_cmos_work, 0); + schedule_delayed_work(&sync_cmos_work, 0); } #else -- cgit v1.2.3 From 88b28adf6fcdd6d10a1cfc7765bb200d7366a265 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Wed, 14 Mar 2012 21:28:56 -0600 Subject: kernel-time: fix s/then/than/ spelling errors Use than for comparisons, like more than. CC: John Stultz Signed-off-by: Jim Cromie Signed-off-by: John Stultz --- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a45ca167ab24..c9583382141a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) { u64 ret; /* - * We won't try to correct for more then 11% adjustments (110,000 ppm), + * We won't try to correct for more than 11% adjustments (110,000 ppm), */ ret = (u64)cs->mult * 11; do_div(ret,100); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16a175bed355..51b98568ba4d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -822,7 +822,7 @@ static void timekeeping_adjust(s64 offset) int adj; /* - * The point of this is to check if the error is greater then half + * The point of this is to check if the error is greater than half * an interval. * * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. @@ -830,7 +830,7 @@ static void timekeeping_adjust(s64 offset) * Note we subtract one in the shift, so that error is really error*2. * This "saves" dividing(shifting) interval twice, but keeps the * (error > interval) comparison as still measuring if error is - * larger then half an interval. + * larger than half an interval. * * Note: It does not "save" on aggravation when reading the code. */ @@ -838,7 +838,7 @@ static void timekeeping_adjust(s64 offset) if (error > interval) { /* * We now divide error by 4(via shift), which checks if - * the error is greater then twice the interval. + * the error is greater than twice the interval. * If it is greater, we need a bigadjust, if its smaller, * we can adjust by 1. 
*/ @@ -949,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<= timekeeper.cycle_interval) { @@ -1071,7 +1071,7 @@ static void update_wall_time(void) /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger then NSEC_PER_SEC + * xtime.tv_nsec isn't larger than NSEC_PER_SEC */ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { int leap; -- cgit v1.2.3 From 8c5cf9e5c50dc902713897e10201aa71f3546aa1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:40 -0700 Subject: ptrace: don't modify flags on PTRACE_SETOPTIONS failure On ptrace(PTRACE_SETOPTIONS, pid, 0, ), we used to set those option bits which are known, and then fail with -EINVAL if there are some unknown bits in . This is inconsistent with typical error handling, which does not change any state if input is invalid. This patch changes PTRACE_SETOPTIONS behavior so that in this case, we return -EINVAL and don't change any bits in task->ptrace. It's very unlikely that there is userspace code in the wild which will be affected by this change: it should have the form ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_BOGUSOPT) where PTRACE_O_BOGUSOPT is a constant unknown to the kernel. But kernel headers, naturally, don't contain any PTRACE_O_BOGUSOPTs, thus the only way userspace can use one if it defines one itself. I can't see why anyone would do such a thing deliberately. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..273f56ea39d2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -528,6 +528,9 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + child->ptrace &= ~PT_TRACE_MASK; if (data & PTRACE_O_TRACESYSGOOD) @@ -551,7 +554,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & PTRACE_O_TRACEEXIT) child->ptrace |= PT_TRACE_EXIT; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -- cgit v1.2.3 From 86b6c1f301faf085de5a3f9ce16b8de6e69c729b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:41 -0700 Subject: ptrace: simplify PTRACE_foo constants and PTRACE_SETOPTIONS code Exchange PT_TRACESYSGOOD and PT_PTRACE_CAP bit positions, which makes PT_option bits contiguous and therefore makes code in ptrace_setoptions() much simpler. Every PTRACE_O_TRACEevent is defined to (1 << PTRACE_EVENT_event) instead of using explicit numeric constants, to ensure we don't mess up relationship between bit positions and event ids. PT_EVENT_FLAG_SHIFT was not particularly useful, PT_OPT_FLAG_SHIFT with value of PT_EVENT_FLAG_SHIFT-1 is easier to use. PT_TRACE_MASK constant is nuked, the only its use is replaced by (PTRACE_O_MASK << PT_OPT_FLAG_SHIFT). 
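A minimal sketch of the resulting layout (constant values quoted for exposition from the headers of this era; include/linux/ptrace.h is authoritative):

/* Event ids and option bits are kept in lockstep, e.g.: */
#define PTRACE_EVENT_FORK	1
#define PTRACE_O_TRACEFORK	(1 << PTRACE_EVENT_FORK)	/* 0x02 */

/* ... so any PTRACE_O_ option converts to its PT_ flag in task->ptrace
 * with a single shift instead of a per-option if/else chain: */
#define PT_OPT_FLAG_SHIFT	3
#define PT_TRACESYSGOOD		(PTRACE_O_TRACESYSGOOD << PT_OPT_FLAG_SHIFT)
#define PT_TRACE_FORK		(PTRACE_O_TRACEFORK << PT_OPT_FLAG_SHIFT)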
Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 273f56ea39d2..9acd07a6f5bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -262,7 +262,7 @@ static int ptrace_attach(struct task_struct *task, long request, /* * Protect exec's credential calculations against our interference; - * interference; SUID, SGID and LSM creds get determined differently + * SUID, SGID and LSM creds get determined differently * under ptrace. */ retval = -ERESTARTNOINTR; @@ -528,31 +528,16 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + unsigned flags; + if (data & ~(unsigned long)PTRACE_O_MASK) return -EINVAL; - child->ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; return 0; } -- cgit v1.2.3 From aa9147c98f27550bd39416eca5a5844e54bced26 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:42 -0700 Subject: ptrace: make PTRACE_SEIZE set ptrace options specified in 'data' parameter This can be used to close a few corner cases in strace where we get unwanted racy behavior after attach, but before we have a chance to set options (the notorious post-execve SIGTRAP comes to mind), and removes the need to track "did we set opts for this task" state in strace internals. While we are at it: Make it possible to extend SEIZE in the future with more functionality by passing non-zero 'addr' parameter. To that end, error out if 'addr' is non-zero. PTRACE_ATTACH did not (and still does not) have such check, and users (strace) do pass garbage there... let's avoid repeating this mistake with SEIZE. Set all task->ptrace bits in one operation - before this change, we were adding PT_SEIZED and PT_PTRACE_CAP with task->ptrace |= BIT ops. This was probably ok (not a bug), but let's be on a safer side. Changes since v2: use (unsigned long) casts instead of (long) ones, move PTRACE_SEIZE_DEVEL-related code to separate lines of code. 
Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Cc: Pedro Alves Reviewed-by: Oleg Nesterov Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9acd07a6f5bb..4661c5bc07e5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,6 +231,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) } static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); @@ -238,19 +239,29 @@ static int ptrace_attach(struct task_struct *task, long request, /* * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL is used to prevent applications + * gradually. SEIZE_DEVEL bit is used to prevent applications * expecting full SEIZE behaviors trapping on kernel commits which * are still in the process of implementing them. * * Only test programs for new ptrace behaviors being implemented * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. + * Once SEIZE behaviors are completely implemented, this flag + * will be removed. */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (!(flags & PTRACE_SEIZE_DEVEL)) + goto out; + flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -282,11 +293,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -879,7 +890,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1022,7 +1033,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. -- cgit v1.2.3 From ee00560c7dac1dbbf048446a8489550d0a5765b7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:43 -0700 Subject: ptrace: remove PTRACE_SEIZE_DEVEL bit PTRACE_SEIZE code is tested and ready for production use, remove the code which requires special bit in data argument to make PTRACE_SEIZE work. Strace team prepares for a new release of strace, and we would like to ship the code which uses PTRACE_SEIZE, preferably after this change goes into released kernel. 
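With the DEVEL bit gone, a tracer can attach and arm its options in one step. A hedged userspace sketch (error handling omitted; assumes headers that already define PTRACE_SEIZE):

#include <sys/ptrace.h>
#include <sys/types.h>

/* Attach without stopping the tracee and set ptrace options atomically,
 * closing the attach-then-PTRACE_SETOPTIONS race described above.
 * addr must be 0; data carries only PTRACE_O_* bits. */
static long seize(pid_t pid)
{
	return ptrace(PTRACE_SEIZE, pid, 0L,
		      PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEEXEC);
}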
Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4661c5bc07e5..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,25 +237,10 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL bit is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag - * will be removed. - */ retval = -EIO; if (seize) { if (addr != 0) goto out; - if (!(flags & PTRACE_SEIZE_DEVEL)) - goto out; - flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); -- cgit v1.2.3 From 629d362b9950166c6fac2aa8425db34d824bb043 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:44 -0700 Subject: signal: give SEND_SIG_FORCED more power to beat SIGNAL_UNKILLABLE force_sig_info() and friends have the special semantics for synchronous signals, this interface should not be used if the target is not current. And it needs the fixes, in particular the clearing of SIGNAL_UNKILLABLE is not exactly right. However there are callers which have to use force_ exactly because it clears SIGNAL_UNKILLABLE and thus it can kill the CLONE_NEWPID tasks, although this is almost always is wrong by various reasons. With this patch SEND_SIG_FORCED ignores SIGNAL_UNKILLABLE, like we do if the signal comes from the ancestor namespace. This makes the naming in prepare_signal() paths insane, fixed by the next cleanup. Note: this only affects SIGKILL/SIGSTOP, but this is enough for force_sig() abusers. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..2584f5a91fbe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; -- cgit v1.2.3 From def8cf72562e17ec8316ce0cb5697c7afd6400e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:45 -0700 Subject: signal: cosmetic, s/from_ancestor_ns/force/ in prepare_signal() paths Cosmetic, rename the from_ancestor_ns argument in prepare_signal() paths. After the previous change it doesn't match the reality. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. 
Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2584f5a91fbe..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1602,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; -- cgit v1.2.3 From a02d6fd643cbd4c559113b35b31d3b04e4ec60c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:46 -0700 Subject: signal: zap_pid_ns_processes: s/SEND_SIG_NOINFO/SEND_SIG_FORCED/ Change zap_pid_ns_processes() to use SEND_SIG_FORCED, it looks more clear compared to SEND_SIG_NOINFO which relies on from_ancestor_ns logic send_signal(). It is also more efficient if we need to kill a lot of tasks because it doesn't alloc sigqueue. While at it, add the __fatal_signal_pending(task) check as a minor optimization. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). 
- */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); -- cgit v1.2.3 From b3449922502f5a161ee2b5022a33aec8472fbf18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: introduce umh_complete(sub_info) Preparation. Add the new trivial helper, umh_complete(). Currently it simply does complete(sub_info->complete). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..8ea25944ce33 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -199,6 +199,11 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + complete(sub_info->complete); +} + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -235,7 +240,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -269,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } -- cgit v1.2.3 From d0bd587a80960d7ba7e0c8396e154028c9045c54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: implement UMH_KILLABLE Implement UMH_KILLABLE, should be used along with UMH_WAIT_EXEC/PROC. The caller must ensure that subprocess_info->path/etc can not go away until call_usermodehelper_freeinfo(). call_usermodehelper_exec(UMH_KILLABLE) does wait_for_completion_killable. If it fails, it uses xchg(&sub_info->complete, NULL) to serialize with umh_complete() which does the same xhcg() to access sub_info->complete. If call_usermodehelper_exec wins, it can safely return. umh_complete() should get NULL and call call_usermodehelper_freeinfo(). Otherwise we know that umh_complete() was already called, in this case call_usermodehelper_exec() falls back to wait_for_completion() which should succeed "very soon". Note: UMH_NO_WAIT == -1 but it obviously should not be used with UMH_KILLABLE. We delay the neccessary cleanup to simplify the back porting. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8ea25944ce33..f92f917c450c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -201,7 +201,15 @@ EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { - complete(sub_info->complete); + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); } /* Keventd can't block, but this (a child) can. 
*/ @@ -252,6 +260,9 @@ static void __call_usermodehelper(struct work_struct *work) enum umh_wait wait = sub_info->wait; pid_t pid; + if (wait != UMH_NO_WAIT) + wait &= ~UMH_KILLABLE; + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -461,9 +472,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: -- cgit v1.2.3 From 9d944ef32e83405a07376f112e9f02161d3e9731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:48 -0700 Subject: usermodehelper: kill umh_wait, renumber UMH_* constants No functional changes. It is not sane to use UMH_KILLABLE with enum umh_wait, but obviously we do not want another argument in call_usermodehelper_* helpers. Kill this enum, use the plain int. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index f92f917c450c..8341de91613f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -257,12 +257,9 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - if (wait != UMH_NO_WAIT) - wait &= ~UMH_KILLABLE; - /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -451,8 +448,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; -- cgit v1.2.3 From 5b9bd473e3b8a8c6c4ae99be475e6e9b27568555 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: usermodehelper: ____call_usermodehelper() doesn't need do_exit() Minor cleanup. ____call_usermodehelper() can simply return, no need to call do_exit() explicitely. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8341de91613f..685b246b13b0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -188,7 +188,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? 
*/ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.2.3 From 3e63a93b987685f02421e18b2aa452d20553a88b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: kmod: introduce call_modprobe() helper No functional changes. Move the call_usermodehelper code from __request_module() into the new simple helper, call_modprobe(). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 685b246b13b0..56a29e812ff0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,21 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait, NULL, NULL, NULL); +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +96,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +138,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; -- cgit v1.2.3 From 1cc684ab75123efe7ff446eb821d44375ba8fa30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:50 -0700 Subject: kmod: make __request_module() killable As Tetsuo Handa pointed out, request_module() can stress the system while the oom-killed caller sleeps in TASK_UNINTERRUPTIBLE. The task T uses "almost all" memory, then it does something which triggers request_module(). Say, it can simply call sys_socket(). This in turn needs more memory and leads to OOM. oom-killer correctly chooses T and kills it, but this can't help because it sleeps in TASK_UNINTERRUPTIBLE and after that oom-killer becomes "disabled" by the TIF_MEMDIE task T. Make __request_module() killable. The only necessary change is that call_modprobe() should kmalloc argv and module_name, they can't live in the stack if we use UMH_KILLABLE. This memory is freed via call_usermodehelper_freeinfo()->cleanup. 
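The killable wait leans on the UMH_KILLABLE handoff introduced two patches earlier. A minimal generic sketch of that idiom (struct and function names invented for illustration):

/* Both sides race to xchg() the completion pointer to NULL; whoever
 * wins the exchange takes ownership of 'info' afterwards. */
static void helper_done(struct my_subinfo *info)
{
	struct completion *comp = xchg(&info->complete, NULL);

	if (comp)
		complete(comp);		/* waiter is still there */
	else
		free_info(info);	/* waiter was killed; we free */
}

static int helper_wait(struct my_subinfo *info, struct completion *done)
{
	if (!wait_for_completion_killable(done))
		return info->retval;	/* caller frees info */
	if (xchg(&info->complete, NULL))
		return -EINTR;		/* helper_done() will free info */
	wait_for_completion(done);	/* we lost: completion is imminent */
	return info->retval;		/* caller frees info */
}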
Reported-by: Tetsuo Handa Signed-off-by: Oleg Nesterov Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 56a29e812ff0..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,12 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + static int call_modprobe(char *module_name, int wait) { static char *envp[] = { @@ -69,10 +75,26 @@ static int call_modprobe(char *module_name, int wait) NULL }; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; return call_usermodehelper_fns(modprobe_path, argv, envp, - wait, NULL, NULL, NULL); + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; } /** -- cgit v1.2.3 From b01c3a0010aabadf745f3e7fdb9cab682e0a28a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Mar 2012 15:41:20 +0100 Subject: perf: Move mmap page data_head offset assertion out of header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having the build time assertion in header is making the perf build fail on x86 with: ../../include/linux/perf_event.h:411:32: error: variably modified \ ‘__assert_mmap_data_head_offset’ at file scope [-Werror] I'm moving the build time validation out of the header, because I think it's better than to lessen the perf build warn/error check. Signed-off-by: Jiri Olsa Cc: acme@redhat.com Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: cjashfor@linux.vnet.ibm.com Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/1332513680-7870-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dc3b05272511..3f92a19aa11e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,6 +7116,13 @@ void __init perf_event_init(void) /* do not patch jump label more than once per second */ jump_label_rate_limit(&perf_sched_events, HZ); + + /* + * Build time assertion that we keep the data_head at the intended + * location. IOW, validation we got the __reserved[] size right. + */ + BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) + != 1024); } static int __init perf_event_sysfs_init(void) -- cgit v1.2.3 From c5e14e763046b11dd8bf57b5dc9f3ab444af8e60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Mar 2012 12:46:23 +0100 Subject: alarmtimer: Don't call rtc_timer_init() when CONFIG_RTC_CLASS=n rtc_timer_init() is not available when CONFIG_RTC_CLASS=n. 
Provide a proper wrapper in the RTC section of alarmtimer.c Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/alarmtimer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c16548807f1e..8a538c55fc7b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,10 +46,9 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); -static struct rtc_timer rtctimer; - #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); @@ -97,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, return 0; } +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, }; @@ -118,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void) #define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } #endif /** @@ -784,7 +789,7 @@ static int __init alarmtimer_init(void) .nsleep = alarm_timer_nsleep, }; - rtc_timer_init(&rtctimer, NULL, NULL); + alarmtimer_rtc_timer_init(); posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); -- cgit v1.2.3 From 02608bef8f774c058779546926889a2f2717a499 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:14 +0800 Subject: module: add kernel param to force disable module load Sometimes we need to test a kernel of same version with code or config option changes. We already have sysctl to disable module load, but add a kernel parameter will be more convenient. Since modules_disabled is int, so here use bint type in core_param. TODO: make sysctl accept bool and change modules_disabled to bool Signed-off-by: Dave Young Signed-off-by: Rusty Russell --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2c932760fd33..7e31da9750c0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ /* Block module loading/unloading? */ int modules_disabled = 0; +core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); -- cgit v1.2.3 From 8b8252813dee8e8cd453bb219731c36b268c69a7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 26 Mar 2012 12:50:51 +1030 Subject: module_param: remove support for bool parameters which are really int. module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. This eliminates that code (though leaves the flags field in the struct, for impending use). 
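For module authors the visible contract is now strict. A minimal sketch of the one remaining supported form (the parameter name here is illustrative):

#include <linux/module.h>

/* The backing variable must really be a bool now; the historical
 * module_param(some_int, bool, ...) form no longer exists. */
static bool verbose;
module_param(verbose, bool, 0644);
MODULE_PARM_DESC(verbose, "enable verbose logging (illustrative)");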
Signed-off-by: Rusty Russell --- kernel/params.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 47f5bf12434a..508828afb874 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -297,35 +297,18 @@ EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { - bool v; - int ret; - /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - ret = strtobool(val, &v); - if (ret) - return ret; - - if (kp->flags & KPARAM_ISBOOL) - *(bool *)kp->arg = v; - else - *(int *)kp->arg = v; - return 0; + return strtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { - bool val; - if (kp->flags & KPARAM_ISBOOL) - val = *(bool *)kp->arg; - else - val = *(int *)kp->arg; - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", val ? 'Y' : 'N'); + return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -343,7 +326,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) struct kernel_param dummy; dummy.arg = &boolval; - dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; @@ -372,7 +354,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp) /* Match bool exactly, by re-using it. */ boolkp = *kp; boolkp.arg = &v; - boolkp.flags |= KPARAM_ISBOOL; ret = param_set_bool(val, &boolkp); if (ret == 0) -- cgit v1.2.3 From 026cee0086fe1df4cf74691cf273062cc769617d Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Mon, 26 Mar 2012 12:50:51 +1030 Subject: params: _initcall-like kernel parameters This patch adds a set of macros that can be used to declare kernel parameters to be parsed _before_ initcalls at a chosen level are executed. We rename the now-unused "flags" field of struct kernel_param as the level. It's signed, for when we use this for early params as well, in future. Linker macro collating init calls had to be modified in order to add additional symbols between levels that are later used by the init code to split the calls into blocks. Signed-off-by: Pawel Moll Signed-off-by: Rusty Russell --- kernel/module.c | 3 ++- kernel/params.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7e31da9750c0..6f6651a54590 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2923,7 +2923,8 @@ static struct module *load_module(void __user *umod, mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. 
*/ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, NULL); if (err < 0) goto unlink; diff --git a/kernel/params.c b/kernel/params.c index 508828afb874..f37d82631347 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -87,6 +87,8 @@ static int parse_one(char *param, char *val, const struct kernel_param *params, unsigned num_params, + s16 min_level, + s16 max_level, int (*handle_unknown)(char *param, char *val)) { unsigned int i; @@ -95,6 +97,9 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + if (params[i].level < min_level + || params[i].level > max_level) + return 0; /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) @@ -174,6 +179,8 @@ int parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, + s16 min_level, + s16 max_level, int (*unknown)(char *param, char *val)) { char *param, *val; @@ -189,7 +196,8 @@ int parse_args(const char *name, args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, unknown); + ret = parse_one(param, val, params, num, + min_level, max_level, unknown); if (irq_was_disabled && !irqs_disabled()) { printk(KERN_WARNING "parse_args(): option '%s' enabled " "irq's!\n", param); @@ -374,7 +382,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), - u16 flags, + s16 level, unsigned int *num) { int ret; @@ -384,7 +392,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; - kp.flags = flags; + kp.level = level; *num = 0; /* We expect a comma-separated list of values. */ @@ -425,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->ops->set, kp->flags, + arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } -- cgit v1.2.3 From d53799be6758841e1ffb1fd3780f73d0ffe44432 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 26 Mar 2012 12:50:52 +1030 Subject: module: move __module_get and try_module_get() out of line. With the preempt, tracepoint and everything, it's getting a bit chubby. 
For an Ubuntu-based config: Before: $ size -t `find * -name '*.ko'` | grep TOTAL 56199906 3870760 1606616 61677282 3ad1ee2 (TOTALS) $ size vmlinux text data bss dec hex filename 8509342 850368 3358720 12718430 c2115e vmlinux After: $ size -t `find * -name '*.ko'` | grep TOTAL 56183760 3867892 1606616 61658268 3acd49c (TOTALS) $ size vmlinux text data bss dec hex filename 8501842 849088 3358720 12709650 c1ef12 vmlinux Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Rusty Russell (made all out-of-line) --- kernel/module.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6f6651a54590..294692d8fcd8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -904,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr, static struct module_attribute modinfo_refcnt = __ATTR(refcnt, 0444, show_refcnt, NULL); +void __module_get(struct module *module) +{ + if (module) { + preempt_disable(); + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + preempt_enable(); + } +} +EXPORT_SYMBOL(__module_get); + +bool try_module_get(struct module *module) +{ + bool ret = true; + + if (module) { + preempt_disable(); + + if (likely(module_is_live(module))) { + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + } else + ret = false; + + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(try_module_get); + void module_put(struct module *module) { if (module) { -- cgit v1.2.3 From f946eeb9313ff1470758e171a60fe7438a2ded3f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 30 Jan 2012 23:07:22 -0500 Subject: module: Remove module size limit Module size was limited to 64MB, this was legacy limitation due to vmalloc() which was removed a while ago. Limiting module size to 64MB is both pointless and affects real world use cases. Cc: Tim Abbott Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 294692d8fcd8..78ac6ec1e425 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2411,8 +2411,7 @@ static int copy_and_check(struct load_info *info, return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ - /* vmalloc barfs on "unusual" numbers. Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) + if ((hdr = vmalloc(len)) == NULL) return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { -- cgit v1.2.3 From 2baab4e90495ebc9826c93f79d74d6e60a828d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Mar 2012 15:57:01 +0100 Subject: sched: Fix select_fallback_rq() vs cpu_active/cpu_online Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was supposed to finally sort the cpu_active mess, instead uncovered more. Since CPU_STARTING is ran before setting the cpu online, there's a (small) window where the cpu has active,!online. If during this time there's a wakeup of a task that used to reside on that cpu select_task_rq() will use select_fallback_rq() to compute an alternative cpu to run on since we find !online. select_fallback_rq() however will compute the new cpu against cpu_active, this means that it can return the same cpu it started out with, the !online one, since that cpu is in fact marked active. This results in us trying to scheduling a task on an offline cpu and triggering a WARN in the IPI code. 
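The window can be pictured with a hypothetical helper (not code from the patch):

/* During CPU_STARTING a cpu is already active but not yet online, so a
 * fallback that filters only on cpu_active_mask can hand back exactly
 * the !online cpu we were trying to move away from: */
static int buggy_fallback(struct task_struct *p)
{
	return cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
}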
The solution proposed by Chuansheng Liu of setting cpu_active in set_cpu_online() is buggy, firstly not all archs actually use set_cpu_online(), secondly, not all archs call set_cpu_online() with IRQs disabled, this means we would introduce either the same race or the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on wrong CPU") -- albeit much narrower. [ By setting online first and active later we have a window of online,!active, fresh and bound kthreads have task_cpu() of 0 and since cpu0 isn't in tsk_cpus_allowed() we end up in select_fallback_rq() which excludes !active, resulting in a reset of ->cpus_allowed and the thread running all over the place. ] The solution is to re-work select_fallback_rq() to require active _and_ online. This makes the active,!online case work as expected, OTOH archs running CPU_STARTING after setting online are now vulnerable to the issue from fd8a7de17 -- these are alpha and blackfin. Reported-by: Chuansheng Liu Signed-off-by: Peter Zijlstra Cc: Mike Frysinger Cc: linux-alpha@vger.kernel.org Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 20 ++++------------- kernel/sched/core.c | 62 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..c9837b74ab96 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2195,7 +2195,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } -int cpuset_cpus_allowed_fallback(struct task_struct *tsk) +void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; int cpu; @@ -2219,22 +2219,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. + * + * select_fallback_rq() will fix things ups and set cpu_possible_mask + * if required. */ - - cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); - if (cpu >= nr_cpu_ids) { - /* - * Either tsk->cpus_allowed is wrong (see above) or it - * is actually empty. The latter case is only possible - * if we are racing with remove_tasks_in_empty_cpuset(). - * Like above we can temporary set any mask and rely on - * set_cpus_allowed_ptr() as synchronization point. - */ - do_set_cpus_allowed(tsk, cpu_possible_mask); - cpu = cpumask_any(cpu_active_mask); - } - - return cpu; } void cpuset_init_current_mems_allowed(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e3ccc13c4caa..9c1629c90b2d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1263,29 +1263,59 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - int dest_cpu; const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + enum { cpuset, possible, fail } state = cpuset; + int dest_cpu; /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + for_each_cpu_mask(dest_cpu, *nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; + } - /* Any allowed, online CPU? 
*/ - dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - return dest_cpu; + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } - /* No more Mr. Nice Guy. */ - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); + switch (state) { + case cpuset: + /* No more Mr. Nice Guy. */ + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } + } + +out: + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_sched("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } } return dest_cpu; -- cgit v1.2.3 From 1b028abc779b67b699daff55e27d2432f8d92666 Mon Sep 17 00:00:00 2001 From: Michael J Wang Date: Mon, 19 Mar 2012 22:26:19 +0000 Subject: sched/rt: Improve pick_next_highest_task_rt() Avoid extra work by continuing on to the next rt_rq if the highest prio task in current rt_rq is the same priority as our candidate task. More detailed explanation: if next is not NULL, then we have found a candidate task, and its priority is next->prio. Now we are looking for an even higher priority task in the other rt_rq's. idx is the highest priority in the current candidate rt_rq. In the current 3.3 code, if idx is equal to next->prio, we would start scanning the tasks in that rt_rq and replace the current candidate task with a task from that rt_rq. But the new task would only have a priority that is equal to our previous candidate task, so we have not advanced our goal of finding a higher prio task. So we should avoid the extra work by continuing on to the next rt_rq if idx is equal to next->prio. Signed-off-by: Michael J Wang Acked-by: Steven Rostedt Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2EF88150C0EF2C43A218742ED384C1BC0FC83D6B@IRVEXCHMB08.corp.ad.broadcom.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b60dad720173..44af55e6d5d0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) next_idx: if (idx >= MAX_RT_PRIO) continue; - if (next && next->prio < idx) + if (next && next->prio <= idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p; -- cgit v1.2.3 From 12b5da349a8b94c9dbc3430a6bc42eabd9eaf50b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Mar 2012 10:43:28 -0400 Subject: tracing: Fix ent_size in trace output When reading the trace file, the records of each of the per_cpu buffers are examined to find the next event to print out. At the point of looking at the event, the size of the event is recorded. 
But if the first event is chosen, the other events in the other CPU buffers will reset the event size that is stored in the iterator descriptor, causing the event size passed to the output functions to be incorrect. In most cases this is not a problem, but for the case of stack traces, it is. With the change to the stack tracing to record a dynamic number of back traces, the output depends on the size of the entry instead of the fixed 8 back traces. When the entry size is not correct, the back traces would not be fully printed. Note, reading from the per-cpu trace files was not affected. Reported-by: Thomas Gleixner Tested-by: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a19c354edd6..ed7b5d1e12f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1698,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; + int next_size = 0; int cpu; /* @@ -1729,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, next_cpu = cpu; next_ts = ts; next_lost = lost_events; + next_size = iter->ent_size; } } + iter->ent_size = next_size; + if (ent_cpu) *ent_cpu = next_cpu; -- cgit v1.2.3 From 160594e99dbbb0a5600ad922c630952c7c1c14bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 Mar 2012 13:46:09 +0300 Subject: cpusets: Remove an unused variable We don't use "cpu" any more after 2baab4e904 "sched: Fix select_fallback_rq() vs cpu_active/cpu_online". Signed-off-by: Dan Carpenter Cc: Paul Menage Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120328104608.GD29022@elgon.mountain Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c9837b74ab96..4ef4d7ecb9fb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2198,7 +2198,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; - int cpu; rcu_read_lock(); cs = task_cs(tsk); -- cgit v1.2.3 From d550bbd40c0e10aefa05103dadbe0ae42e683707 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Disintegrate asm/system.h for Sparc Disintegrate asm/system.h for Sparc. 
Signed-off-by: David Howells cc: sparclinux@vger.kernel.org --- kernel/signal.c | 1 + kernel/sysctl.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..5120f1901f36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d53046b905..04402ab7a046 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #endif +#ifdef CONFIG_SPARC +#include +#endif #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif -- cgit v1.2.3 From 96f951edb1f1bdbbc99b0cd458f9808bb83d58ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Add #includes needed to permit the removal of asm/system.h asm/system.h is a cause of circular dependency problems because it contains commonly used primitive stuff like barrier definitions and uncommonly used stuff like switch_to() that might require MMU definitions. asm/system.h has been disintegrated by this point on all arches into the following common segments: (1) asm/barrier.h Moved memory barrier definitions here. (2) asm/cmpxchg.h Moved xchg() and cmpxchg() here. #included in asm/atomic.h. (3) asm/bug.h Moved die() and similar here. (4) asm/exec.h Moved arch_align_stack() here. (5) asm/elf.h Moved AT_VECTOR_SIZE_ARCH here. (6) asm/switch_to.h Moved switch_to() here. Signed-off-by: David Howells --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 503d6426126d..157fb9b2b186 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 9ffc93f203c18a70623f21950f1dd473c9ec48cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Remove all #inclusions of asm/system.h Remove all #inclusions of asm/system.h preparatory to splitting and killing it. Performed with the following command: perl -p -i -e 's!^#\s*include\s*.*\n!!' `grep -Irl '^#\s*include\s*' *` Signed-off-by: David Howells --- kernel/debug/debug_core.c | 1 - kernel/debug/kdb/kdb_bt.c | 1 - kernel/dma.c | 1 - kernel/kexec.c | 1 - kernel/rwsem.c | 1 - kernel/sysctl.c | 1 - 6 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..de50f7debd40 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -52,7 +52,6 @@ #include #include #include -#include #include "debug_core.h" diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7179eac7b41c..07c9bbb94a0b 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "kdb_private.h" diff --git a/kernel/dma.c b/kernel/dma.c index 68a2306522c8..6c6262f86c17 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -18,7 +18,6 @@ #include #include #include -#include diff --git a/kernel/kexec.c b/kernel/kexec.c index a6a675cb9818..2a0deffa5dbe 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -37,7 +37,6 @@ #include #include #include -#include #include /* Per cpu memory for storing cpu states in case of system crash. 
*/ diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b152f74f02de..6850f53e02d8 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -10,7 +10,6 @@ #include #include -#include #include /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 04402ab7a046..696f394c2cb0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,6 @@ static const int cap_last_cap = CAP_LAST_CAP; #include #endif #ifdef CONFIG_SPARC -#include #endif #ifdef CONFIG_SPARC64 -- cgit v1.2.3 From fe2e39d8782d885755139304d8dba0b3e5bfa878 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Mar 2012 23:29:45 +0200 Subject: firmware_class: Rework usermodehelper check Instead of two functions, read_lock_usermodehelper() and usermodehelper_is_disabled(), used in combination, introduce usermodehelper_read_trylock() that will only return with umhelper_sem held if usermodehelper_disabled is unset (and will return -EAGAIN otherwise) and make _request_firmware() use it. Rename read_unlock_usermodehelper() to usermodehelper_read_unlock() to follow the naming convention of the new function. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- kernel/kmod.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 957a7aab8ebc..4079ac1d5e79 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -339,17 +339,24 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -void read_lock_usermodehelper(void) +int usermodehelper_read_trylock(void) { + int ret = 0; + down_read(&umhelper_sem); + if (usermodehelper_disabled) { + up_read(&umhelper_sem); + ret = -EAGAIN; + } + return ret; } -EXPORT_SYMBOL_GPL(read_lock_usermodehelper); +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); -void read_unlock_usermodehelper(void) +void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } -EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** * usermodehelper_disable - prevent new helpers from being started @@ -390,15 +397,6 @@ void usermodehelper_enable(void) up_write(&umhelper_sem); } -/** - * usermodehelper_is_disabled - check if new helpers are allowed to be started - */ -bool usermodehelper_is_disabled(void) -{ - return usermodehelper_disabled; -} -EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); - static void helper_lock(void) { atomic_inc(&running_helpers); -- cgit v1.2.3 From 9b78c1da60b3c62ccdd1509f0902ad19ceaf776b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Mar 2012 23:30:02 +0200 Subject: firmware_class: Do not warn that system is not ready from async loads If firmware is requested asynchronously, by calling request_firmware_nowait(), there is no reason to fail the request (and warn the user) when the system is (presumably temporarily) unready to handle it (because user space is not available yet or frozen). For this reason, introduce an alternative routine for read-locking umhelper_sem, usermodehelper_read_lock_wait(), that will wait for usermodehelper_disabled to be unset (possibly with a timeout) and make request_firmware_work_func() use it instead of usermodehelper_read_trylock(). Accordingly, modify request_firmware() so that it uses usermodehelper_read_trylock() to acquire umhelper_sem and remove the code related to that lock from _request_firmware(). Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- kernel/kmod.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 4079ac1d5e79..da7fcca279f9 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -333,6 +333,12 @@ static atomic_t running_helpers = ATOMIC_INIT(0); */ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); +/* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + /* * Time to wait for running_helpers to become zero before the setting of * usermodehelper_disabled in usermodehelper_disable() fails @@ -352,12 +358,50 @@ int usermodehelper_read_trylock(void) } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); + void usermodehelper_read_unlock(void) { up_read(&umhelper_sem); } EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = 0; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + /** * usermodehelper_disable - prevent new helpers from being started */ @@ -381,22 +425,10 @@ int usermodehelper_disable(void) if (retval) return 0; - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); + usermodehelper_enable(); return -EAGAIN; } -/** - * usermodehelper_enable - allow new helpers to be started again - */ -void usermodehelper_enable(void) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = 0; - up_write(&umhelper_sem); -} - static void helper_lock(void) { atomic_inc(&running_helpers); -- cgit v1.2.3 From 7b5179ac14dbad945647ac9e76bbbf14ed9e0dbe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Mar 2012 23:30:14 +0200 Subject: PM / Hibernate: Disable usermode helpers right before freezing tasks There is no reason to call usermodehelper_disable() before creating memory bitmaps in hibernate() and software_resume(), so call it right before freeze_processes(), in accordance with the other suspend and hibernation code. Consequently, call usermodehelper_enable() right after the thawing of tasks rather than after freeing the memory bitmaps. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- kernel/power/hibernate.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0a186cfde788..639ff6e4ae9e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -611,19 +611,19 @@ int hibernate(void) if (error) goto Exit; - error = usermodehelper_disable(); - if (error) - goto Exit; - /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Enable_umh; + goto Exit; printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); + error = usermodehelper_disable(); + if (error) + goto Exit; + error = freeze_processes(); if (error) goto Free_bitmaps; @@ -660,9 +660,8 @@ int hibernate(void) freezer_test_done = false; Free_bitmaps: - free_basic_memory_bitmaps(); - Enable_umh: usermodehelper_enable(); + free_basic_memory_bitmaps(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -777,15 +776,13 @@ static int software_resume(void) if (error) goto close_finish; - error = usermodehelper_disable(); + error = create_basic_memory_bitmaps(); if (error) goto close_finish; - error = create_basic_memory_bitmaps(); - if (error) { - usermodehelper_enable(); + error = usermodehelper_disable(); + if (error) goto close_finish; - } pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); @@ -805,8 +802,8 @@ static int software_resume(void) swsusp_free(); thaw_processes(); Done: - free_basic_memory_bitmaps(); usermodehelper_enable(); + free_basic_memory_bitmaps(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); -- cgit v1.2.3 From 1e73203cd1157a03facc41ffb54050f5b28e55bd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Mar 2012 23:30:21 +0200 Subject: PM / Sleep: Move disabling of usermode helpers to the freezer The core suspend/hibernation code calls usermodehelper_disable() to avoid race conditions between the freezer and the starting of usermode helpers and each code path has to do that on its own. However, it is always called right before freeze_processes() and usermodehelper_enable() is always called right after thaw_processes(). For this reason, to avoid code duplication and to make the connection between usermodehelper_disable() and the freezer more visible, make freeze_processes() call it and remove the direct usermodehelper_disable() and usermodehelper_enable() calls from all suspend/hibernation code paths. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- kernel/power/hibernate.c | 11 ----------- kernel/power/process.c | 7 +++++++ kernel/power/suspend.c | 7 ------- kernel/power/user.c | 10 +--------- 4 files changed, 8 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 639ff6e4ae9e..e09dfbfeecee 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -620,10 +619,6 @@ int hibernate(void) sys_sync(); printk("done.\n"); - error = usermodehelper_disable(); - if (error) - goto Exit; - error = freeze_processes(); if (error) goto Free_bitmaps; @@ -660,7 +655,6 @@ int hibernate(void) freezer_test_done = false; Free_bitmaps: - usermodehelper_enable(); free_basic_memory_bitmaps(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); @@ -780,10 +774,6 @@ static int software_resume(void) if (error) goto close_finish; - error = usermodehelper_disable(); - if (error) - goto close_finish; - pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); if (error) { @@ -802,7 +792,6 @@ static int software_resume(void) swsusp_free(); thaw_processes(); Done: - usermodehelper_enable(); free_basic_memory_bitmaps(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); diff --git a/kernel/power/process.c b/kernel/power/process.c index 0d2aeb226108..56eaac7e88ab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Timeout for stopping processes @@ -122,6 +123,10 @@ int freeze_processes(void) { int error; + error = usermodehelper_disable(); + if (error) + return error; + if (!pm_freezing) atomic_inc(&system_freezing_cnt); @@ -187,6 +192,8 @@ void thaw_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); + usermodehelper_enable(); + schedule(); printk("done.\n"); } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 88e5c967370d..396d262b8fd0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -102,17 +101,12 @@ static int suspend_prepare(void) if (error) goto Finish; - error = usermodehelper_disable(); - if (error) - goto Finish; - error = suspend_freeze_processes(); if (!error) return 0; suspend_stats.failed_freeze++; dpm_save_failed_step(SUSPEND_FREEZE); - usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -259,7 +253,6 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); - usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/user.c b/kernel/power/user.c index 33c4329205af..91b0fd021a95 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, sys_sync(); printk("done.\n"); - error = usermodehelper_disable(); - if (error) - break; - error = freeze_processes(); - if (error) - usermodehelper_enable(); - else + if (!error) data->frozen = 1; break; @@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; pm_restore_gfp_mask(); thaw_processes(); - usermodehelper_enable(); data->frozen = 0; break; -- cgit v1.2.3 From 
247bc03742545fec2f79939a3b9f738392a0f7b4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Mar 2012 23:30:28 +0200 Subject: PM / Sleep: Mitigate race between the freezer and request_firmware() There is a race condition between the freezer and request_firmware() such that if request_firmware() is run on one CPU and freeze_processes() is run on another CPU and usermodehelper_disable() called by it succeeds to grab umhelper_sem for writing before usermodehelper_read_trylock() called from request_firmware() acquires it for reading, the request_firmware() will fail and trigger a WARN_ON() complaining that it was called at a wrong time. However, in fact, it wasn't called at a wrong time and freeze_processes() simply happened to be executed simultaneously. To avoid this race, at least in some cases, modify usermodehelper_read_trylock() so that it doesn't fail if the freezing of tasks has just started and hasn't been completed yet. Instead, during the freezing of tasks, it will try to freeze the task that has called it so that it can wait until user space is thawed without triggering the scary warning. For this purpose, change usermodehelper_disabled so that it can take three different values, UMH_ENABLED (0), UMH_FREEZING and UMH_DISABLED. The first one means that usermode helpers are enabled, the last one means "hard disable" (i.e. the system is not ready for usermode helpers to be used) and the second one is reserved for the freezer. Namely, when freeze_processes() is started, it sets usermodehelper_disabled to UMH_FREEZING which tells usermodehelper_read_trylock() that it shouldn't fail just yet and should call try_to_freeze() if woken up and cannot return immediately. This way all freezable tasks that happen to call request_firmware() right before freeze_processes() is started and lose the race for umhelper_sem with it will be frozen and will sleep until thaw_processes() unsets usermodehelper_disabled. [For the non-freezable callers of request_firmware() the race for umhelper_sem against freeze_processes() is unfortunately unavoidable.] Reported-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org --- kernel/kmod.c | 47 +++++++++++++++++++++++++++++++++++++---------- kernel/power/process.c | 3 ++- 2 files changed, 39 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index da7fcca279f9..05698a7415fe 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -322,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work) * land has been frozen during a system-wide hibernation or suspend operation). * Should always be manipulated under umhelper_sem acquired for write. 
*/ -static int usermodehelper_disabled = 1; +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; /* Number of helpers running */ static atomic_t running_helpers = ATOMIC_INIT(0); @@ -347,13 +347,30 @@ static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); int usermodehelper_read_trylock(void) { + DEFINE_WAIT(wait); int ret = 0; down_read(&umhelper_sem); - if (usermodehelper_disabled) { + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + up_read(&umhelper_sem); - ret = -EAGAIN; + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); } + finish_wait(&usermodehelper_disabled_waitq, &wait); return ret; } EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); @@ -392,25 +409,35 @@ void usermodehelper_read_unlock(void) EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); /** - * usermodehelper_enable - allow new helpers to be started again + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. + * depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. */ -void usermodehelper_enable(void) +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) { down_write(&umhelper_sem); - usermodehelper_disabled = 0; + usermodehelper_disabled = depth; wake_up(&usermodehelper_disabled_waitq); up_write(&umhelper_sem); } /** - * usermodehelper_disable - prevent new helpers from being started + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. */ -int usermodehelper_disable(void) +int __usermodehelper_disable(enum umh_disable_depth depth) { long retval; + if (!depth) + return -EINVAL; + down_write(&umhelper_sem); - usermodehelper_disabled = 1; + usermodehelper_disabled = depth; up_write(&umhelper_sem); /* @@ -425,7 +452,7 @@ int usermodehelper_disable(void) if (retval) return 0; - usermodehelper_enable(); + __usermodehelper_set_disable_depth(UMH_ENABLED); return -EAGAIN; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 56eaac7e88ab..19db29f67558 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -123,7 +123,7 @@ int freeze_processes(void) { int error; - error = usermodehelper_disable(); + error = __usermodehelper_disable(UMH_FREEZING); if (error) return error; @@ -135,6 +135,7 @@ int freeze_processes(void) error = try_to_freeze_tasks(true); if (!error) { printk("done."); + __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); } printk("\n"); -- cgit v1.2.3 From c4772d192c70b61d52262b0db76f7abd8aeb51c6 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Wed, 28 Mar 2012 23:31:24 +0200 Subject: PM / QoS: add pm_qos_update_request_timeout() API The new API, pm_qos_update_request_timeout() is to provide a timeout with pm_qos_update_request. For example, pm_qos_update_request_timeout(req, 100, 1000), means that QoS request on req with value 100 will be active for 1000 microseconds. After 1000 microseconds, the QoS request thru req is reset. If there were another pm_qos_update_request(req, x) during the 1000 us, this new request with value x will override as this is another request on the same req handle. 
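To make the calling convention concrete, here is a minimal driver-side sketch (the request object, QoS class and values are illustrative assumptions, not part of this patch):

#include <linux/pm_qos.h>

static struct pm_qos_request example_req;	/* hypothetical driver state */

static int example_init(void)
{
	/* Register the request once, initially inactive (default value). */
	pm_qos_add_request(&example_req, PM_QOS_CPU_DMA_LATENCY,
			   PM_QOS_DEFAULT_VALUE);
	return 0;
}

static void example_burst(void)
{
	/* Ask for <=100 usec latency, but only for the next 1000 usec;
	 * the constraint falls back to the default automatically when
	 * the delayed work fires. */
	pm_qos_update_request_timeout(&example_req, 100, 1000);
}
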
A new request on the same req handle will always override the previous request whether it is the conventional request or it is the new timeout request. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Mark Gross Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index d6d6dbd1ecc0..6a031e684026 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -229,6 +229,21 @@ int pm_qos_request_active(struct pm_qos_request *req) } EXPORT_SYMBOL_GPL(pm_qos_request_active); +/** + * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout + * @work: work struct for the delayed work (timeout) + * + * This cancels the timeout request by falling back to the default at timeout. + */ +static void pm_qos_work_fn(struct work_struct *work) +{ + struct pm_qos_request *req = container_of(to_delayed_work(work), + struct pm_qos_request, + work); + + pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); +} + /** * pm_qos_add_request - inserts new qos request into the list * @req: pointer to a preallocated handle @@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req, return; } req->pm_qos_class = pm_qos_class; + INIT_DELAYED_WORK(&req->work, pm_qos_work_fn); pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, &req->node, PM_QOS_ADD_REQ, value); } @@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + if (new_value != req->node.prio) pm_qos_update_target( pm_qos_array[req->pm_qos_class]->constraints, @@ -286,6 +305,34 @@ void pm_qos_update_request(struct pm_qos_request *req, } EXPORT_SYMBOL_GPL(pm_qos_update_request); +/** + * pm_qos_update_request_timeout - modifies an existing qos request temporarily. + * @req : handle to list element holding a pm_qos request to use + * @new_value: defines the temporal qos request + * @timeout_us: the effective duration of this qos request in usecs. + * + * After timeout_us, this qos request is cancelled automatically. 
+ */ +void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, + unsigned long timeout_us) +{ + if (!req) + return; + if (WARN(!pm_qos_request_active(req), + "%s called for unknown object.", __func__)) + return; + + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); + + schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us)); +} + /** * pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element @@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } + if (delayed_work_pending(&req->work)) + cancel_delayed_work_sync(&req->work); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); -- cgit v1.2.3 From 3fc498f165304dc913f1d13b5ac9ab4c758ee7ab Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: smp: introduce a generic on_each_cpu_mask() function We have lots of infrastructure in place to partition multi-core systems such that we have a group of CPUs that are dedicated to a specific task: cgroups, scheduler and interrupt affinity, and the cpuisol= boot parameter. Still, kernel code will at times interrupt all CPUs in the system via IPIs for various needs. These IPIs are useful and cannot be avoided altogether, but in certain cases it is possible to interrupt only specific CPUs that have useful work to do and not the entire system. This patch set, inspired by discussions with Peter Zijlstra and Frederic Weisbecker when testing the nohz task patch set, is a first stab at trying to explore doing this by locating the places where such global IPI calls are being made and turning the global IPI into an IPI for a specific group of CPUs. The purpose of the patch set is to get feedback on whether this is the right way to go for dealing with this issue and indeed, whether the issue is even worth dealing with at all. Based on the feedback from this patch set I plan to offer further patches that address a similar issue in other code paths. This patch creates an on_each_cpu_mask() and on_each_cpu_cond() infrastructure API (the former derived from existing arch specific versions in Tile and Arm) and uses them to turn several global IPI invocations into per-CPU-group invocations. Core kernel: on_each_cpu_mask() calls a function on processors specified by cpumask, which may or may not include the local processor. You must not call this function with disabled interrupts or from a hardware interrupt handler or from a bottom half handler. arch/arm: Note that the generic version is a little different from the Arm one: 1. It has the mask as its first parameter 2. It calls the function on the calling CPU with interrupts disabled, but this should be OK since the function is called on the other CPUs with interrupts disabled anyway. arch/tile: The API is the same as the tile private one, but the generic version also calls the function on the calling CPU with interrupts disabled in the UP case. This is OK since the function is called on the other CPUs with interrupts disabled. 
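As a usage sketch for the new core API (the per-CPU flag and both helper functions here are hypothetical; only the on_each_cpu_mask() call itself comes from this patch):

#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(unsigned long, pending_work);

/* Runs on each selected CPU; interrupts are disabled while it runs. */
static void clear_pending_work(void *info)
{
	this_cpu_write(pending_work, 0);
}

/* IPI only the CPUs in @mask instead of every online CPU, waiting
 * until the handler has completed everywhere it was sent. */
static void clear_pending_on(const struct cpumask *mask)
{
	on_each_cpu_mask(mask, clear_pending_work, NULL, true);
}
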
Signed-off-by: Gilad Ben-Yossef Reviewed-by: Christoph Lameter Acked-by: Chris Metcalf Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Rik van Riel Cc: Andi Kleen Cc: Sasha Levin Cc: Mel Gorman Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Cc: Russell King Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index db197d60489b..a081e6ce0e0a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -701,3 +701,32 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); -- cgit v1.2.3 From b3a7e98e024ffa9f7e4554dd720c508015c4a831 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: smp: add func to IPI cpus based on parameter func Add the on_each_cpu_cond() function that wraps on_each_cpu_mask() and calculates the cpumask of cpus to IPI by calling a function supplied as a parameter in order to determine whether to IPI each specific cpu. The function works around allocation failure of cpumask variable in CONFIG_CPUMASK_OFFSTACK=y by iterating over cpus, sending one IPI at a time, via smp_call_function_single(). The function is useful since it allows separating the specific code that decides in each case whether to IPI a specific cpu for a specific request from the common boilerplate code of creating the mask, handling failures etc. [akpm@linux-foundation.org: s/gfpflags/gfp_flags/] [akpm@linux-foundation.org: avoid double-evaluation of `info' (per Michal), parenthesise evaluation of `cond_func'] [akpm@linux-foundation.org: s/CPU/CPUs, use all 80 cols in comment] Signed-off-by: Gilad Ben-Yossef Cc: Chris Metcalf Cc: Christoph Lameter Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Sasha Levin Cc: Rik van Riel Cc: Andi Kleen Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Reviewed-by: "Srivatsa S. 
Bhat" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index a081e6ce0e0a..2f8b10ecf759 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -730,3 +730,64 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, put_cpu(); } EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the the info parameter. The function is called + * with preemption disabled. The function should + * return a blooean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. + * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicates a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. + */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(!ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); -- cgit v1.2.3 From d034cfab4f7b9e768c5c1caaa56c5bd4805d2b92 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kexec: crash: don't save swapper_pg_dir for !CONFIG_MMU configurations nommu platforms don't have very interesting swapper_pg_dir pointers and usually just #define them to NULL, meaning that we can't include them in the vmcoreinfo on the kexec crash path. This patch only saves the swapper_pg_dir if we have an MMU. Signed-off-by: Will Deacon Reviewed-by: Simon Horman Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index a6a675cb9818..769e347c5196 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1462,7 +1462,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmlist); -- cgit v1.2.3 From eaa3be6add6f327ab0a633e4fee8e6f2cc8c8a4c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kexec: add further check to crashkernel When using crashkernel=2M-256M, the kernel doesn't give any warning. This is misleading sometimes. Signed-off-by: Zhenzhong Duan Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 769e347c5196..3288c9b29bae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1359,6 +1359,10 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warning("crashkernel: unrecognized char\n"); + return -EINVAL; + } return 0; } -- cgit v1.2.3 From 5a04cca6c39cdd0b8c75b0628da634248f381b62 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 28 Mar 2012 14:42:50 -0700 Subject: sysctl: use bitmap library functions Use bitmap_set() instead of using set_bit() for each bit. This conversion is valid because the bitmap is private in the function call and atomic bitops were unnecessary. This also includes minor change. - Use bitmap_copy() for shorter typing Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d48ff4fd44c3..dbd70bdc1765 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -2393,9 +2394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; proc_skip_char(&kbuf, &left, '\n'); } @@ -2438,8 +2437,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } kfree(tmp_bitmap); *lenp -= left; -- cgit v1.2.3 From cf3f89214ef6a33fad60856bc5ffd7bb2fc4709b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 28 Mar 2012 14:42:51 -0700 Subject: pidns: add reboot_pid_ns() to handle the reboot syscall In the case of a child pid namespace, rebooting the system does not really makes sense. When the pid namespace is used in conjunction with the other namespaces in order to create a linux container, the reboot syscall leads to some problems. A container can reboot the host. That can be fixed by dropping the sys_reboot capability but we are unable to correctly to poweroff/ halt/reboot a container and the container stays stuck at the shutdown time with the container's init process waiting indefinitively. 
After several attempts, no solution from userspace was found to reliably handle the shutdown from a container. This patch proposes to make the init process of the child pid namespace exit with a signal status set to: SIGINT if the child pid namespace called "halt/poweroff" and SIGHUP if the child pid namespace called "reboot". When the reboot syscall is called and we are not in the initial pid namespace, we kill the pid namespace for "HALT", "POWEROFF", "RESTART", and "RESTART2". Otherwise we return EINVAL. Returning EINVAL is also an easy way to check if this feature is supported by the kernel when invoking another 'reboot' option like CAD. This way the parent process of the child pid namespace knows whether it rebooted or not and can take the right decision. Test case: ========== #include #include #include #include #include #include #include #include #include static int do_reboot(void *arg) { int *cmd = arg; if (reboot(*cmd)) printf("failed to reboot(%d): %m\n", *cmd); } int test_reboot(int cmd, int sig) { long stack_size = 4096; void *stack = alloca(stack_size) + stack_size; int status; pid_t ret; ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd); if (ret < 0) { printf("failed to clone: %m\n"); return -1; } if (wait(&status) < 0) { printf("unexpected wait error: %m\n"); return -1; } if (!WIFSIGNALED(status)) { printf("child process exited but was not signaled\n"); return -1; } if (WTERMSIG(status) != sig) { printf("signal termination is not the one expected\n"); return -1; } return 0; } int main(int argc, char *argv[]) { int status; status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1); if (status >= 0) { printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n"); return 1; } printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n"); return 0; } [akpm@linux-foundation.org: tweak and add comments] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Daniel Lezcano Acked-by: Serge Hallyn Tested-by: Serge Hallyn Reviewed-by: Oleg Nesterov Cc: Michael Kerrisk Cc: "Eric W. 
Biederman" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys.c | 9 +++++++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 17b232869a04..57bc1fd35b3c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + if (pid_ns->reboot) + current->signal->group_exit_code = pid_ns->reboot; + acct_exit_ns(pid_ns); return; } @@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + if (pid_ns == &init_pid_ns) + return 0; + + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART2: + case LINUX_REBOOT_CMD_RESTART: + pid_ns->reboot = SIGHUP; + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + pid_ns->reboot = SIGINT; + break; + default: + return -EINVAL; + } + + read_lock(&tasklist_lock); + force_sig(SIGKILL, pid_ns->child_reaper); + read_unlock(&tasklist_lock); + + do_exit(0); + + /* Not reached */ + return 0; +} + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/sys.c b/kernel/sys.c index 9eb7fcab8df6..e7006eb6c1e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + if (ret) + return ret; + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ -- cgit v1.2.3 From 5f054e31c63be774bf1ce252f20d56012a00f8a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Mar 2012 15:38:31 +1030 Subject: documentation: remove references to cpu_*_map. This has been obsolescent for a while, fix documentation and misc comments. Signed-off-by: Rusty Russell --- kernel/cpuset.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1010cc61931f..eedeebe64b1a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = { * are online. If none are online, walk up the cpuset hierarchy * until we find one that does have some online cpus. If we get * all the way to the top and still haven't found any online cpus, - * return cpu_online_map. Or if passed a NULL cs from an exit'ing - * task, return cpu_online_map. + * return cpu_online_mask. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_mask. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_map. + * of cpu_online_mask. * * Call with callback_mutex held. 
*/ @@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; int is_load_balanced; - /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2149,7 +2149,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_map, even if this means going outside the + * subset of cpu_online_mask, even if this means going outside the * tasks cpuset. **/ -- cgit v1.2.3 From 6135fc1eb4b1c9ae5f535507ed59591bab51e630 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 28 Mar 2012 17:10:47 -0700 Subject: sched: Fix __schedule_bug() output when called from an interrupt If schedule() is called from an interrupt handler, __schedule_bug() will call show_regs() with the registers saved during the interrupt handling done in do_IRQ(). This means we'll see the registers and the backtrace for the process that was interrupted and not the full backtrace explaining who called schedule(). This is due to 838225b ("sched: use show_regs() to improve __schedule_bug() output", 2007-10-24) which improperly assumed that get_irq_regs() would return the registers for the current stack because it is being called from within an interrupt handler. Simply remove the show_regs() code so that we dump a backtrace for the interrupt handler that called schedule(). [ I ran across this when I was presented with a scheduling while atomic log with a stacktrace pointing at spin_unlock_irqrestore(). It made no sense and I had to guess what interrupt handler could be called and poke around for someone calling schedule() in an interrupt handler. A simple test of putting an msleep() in an interrupt handler works better with this patch because you can actually see the msleep() call in the backtrace. ] Also-reported-by: Chris Metcalf Signed-off-by: Stephen Boyd Cc: Satyam Sharma Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1332979847-27102-1-git-send-email-sboyd@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c1629c90b2d..929fd857ef88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3099,8 +3099,6 @@ EXPORT_SYMBOL(sub_preempt_count); */ static noinline void __schedule_bug(struct task_struct *prev) { - struct pt_regs *regs = get_irq_regs(); - if (oops_in_progress) return; @@ -3111,11 +3109,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); } /* -- cgit v1.2.3 From 69592db298e400a7c175c4dfbe7a086c783f349d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 21 Mar 2012 17:22:13 +0100 Subject: genirq: Minor readability improvement in irq_wake_thread() exit_irq_thread() clears the IRQTF_RUNTHREAD flag and then drops the thread's bit in desc->threads_oneshot. The bit must not be set again in between and it does not, since irq_wake_thread() sees the PF_EXITING flag first and returns. Due to the above, the order of checking the PF_EXITING and IRQTF_RUNTHREAD flags in irq_wake_thread() is important. This change just makes it more visible in the source code. 
Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120321162212.GO24806@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6ff84e6a954c..bdb180325551 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* - * Wake up the handler thread for this action. In case the - * thread crashed and was killed we just pretend that we - * handled the interrupt. The hardirq handler has disabled the - * device interrupt, so no irq storm is lurking. If the + * In case the thread crashed and was killed we just pretend that + * we handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. + */ + if (action->thread->flags & PF_EXITING) + return; + + /* + * Wake up the handler thread for this action. If the * RUNTHREAD bit is already set, nothing to do. */ - if ((action->thread->flags & PF_EXITING) || - test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; /* -- cgit v1.2.3 From f3f79e38d51f8a419f4c484a86ece4baea35b993 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 21 Mar 2012 17:22:35 +0100 Subject: genirq: Get rid of unneeded force parameter in irq_finalize_oneshot() The only place irq_finalize_oneshot() is called with force parameter set is the threaded handler error exit path. But IRQTF_RUNTHREAD is dropped at this point and irq_wake_thread() is not going to set it again, since PF_EXITING is set for this thread already. So irq_finalize_oneshot() will drop the threads bit in threads_oneshot anyway and hence the force parameter is superfluous. Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120321162234.GP24806@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b0ccd1ac2d6a..bf606a53a21c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -645,7 +645,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, - struct irqaction *action, bool force) + struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT)) return; @@ -679,7 +679,7 @@ again: * we would clear the threads_oneshot bit of this thread which * was just set. 
*/ - if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; @@ -739,7 +739,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; } @@ -755,7 +755,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); - irq_finalize_oneshot(desc, action, false); + irq_finalize_oneshot(desc, action); return ret; } @@ -844,7 +844,7 @@ void exit_irq_thread(void) wake_threads_waitq(desc); /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action, true); + irq_finalize_oneshot(desc, action); } static void irq_setup_forced_threading(struct irqaction *new) -- cgit v1.2.3 From 241fc640be783f903e74b6d9c68481c01873f758 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 26 Mar 2012 15:02:18 -0400 Subject: genirq: Respect NUMA node affinity in setup_affinity() We respect node affinity of devices already in the irq descriptor allocation, but we ignore it for the initial interrupt affinity setup, so the interrupt might be routed to a different node. Restrict the default affinity mask to the node on which the irq descriptor is allocated. [ tglx: Massaged changelog ] Signed-off-by: Prarit Bhargava Acked-by: Neil Horman Cc: Yinghai Lu Cc: David Rientjes Link: http://lkml.kernel.org/r/1332788538-17425-1-git-send-email-prarit@redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bf606a53a21c..89a3ea82569b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret; + int ret, node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } cpumask_and(mask, cpu_online_mask, set); + if (node != NUMA_NO_NODE) { + const struct cpumask *nodemask = cpumask_of_node(node); + + /* make sure at least one of the cpus in nodemask is online */ + if (cpumask_intersects(mask, nodemask)) + cpumask_and(mask, mask, nodemask); + } ret = chip->irq_set_affinity(&desc->irq_data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: -- cgit v1.2.3 From bdbb776f882f5ad431aa1e694c69c1c3d6a4a5b8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 19 Mar 2012 16:12:53 -0700 Subject: futex: Do not leak robust list to unprivileged process It was possible to extract the robust list head address from a setuid process if it had used set_robust_list(), allowing an ASLR info leak. This changes the permission checks to be the same as those used for similar info that comes out of /proc. Running a setuid program that uses robust futexes would have had: cred->euid != pcred->euid cred->euid == pcred->uid so the old permissions check would allow it. I'm not aware of any setuid programs that use robust futexes, so this is just a preventative measure. (This patch is based on changes from grsecurity.)
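For readers who want to poke at the interface, a small userspace sketch (hypothetical demo code; it assumes a Linux system where glibc provides no wrapper, so the raw syscall(2) interface is used):

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	int main(void)
	{
		struct robust_list_head *head;
		size_t len;

		/* Query pid 1 (init).  With this fix an unprivileged
		 * caller gets EPERM instead of a pointer that leaks an
		 * ASLR hint about the target's address space. */
		if (syscall(SYS_get_robust_list, 1, &head, &len))
			perror("get_robust_list");
		else
			printf("head=%p len=%zu\n", (void *)head, len);
		return 0;
	}

Using ptrace_may_access() means the rule is now the same one that governs /proc/<pid>/maps and friends, instead of a hand-rolled uid comparison.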
Signed-off-by: Kees Cook Cc: Darren Hart Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Eric W. Biederman Cc: David Howells Cc: Serge E. Hallyn Cc: kernel-hardening@lists.openwall.com Cc: spender@grsecurity.net Link: http://lkml.kernel.org/r/20120319231253.GA20893@www.outflux.net Signed-off-by: Thomas Gleixner --- kernel/futex.c | 36 +++++++++++++----------------------- kernel/futex_compat.c | 36 +++++++++++++----------------------- 2 files changed, 26 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 72efa1e4359a..d701be57c423 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -59,6 +59,7 @@ #include #include #include +#include #include @@ -2443,40 +2444,29 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, { struct robust_list_head __user *head; unsigned long ret; - const struct cred *cred = current_cred(), *pcred; + struct task_struct *p; if (!futex_cmpxchg_enabled) return -ENOSYS; + rcu_read_lock(); + + ret = -ESRCH; if (!pid) - head = current->robust_list; + p = current; else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->robust_list; - rcu_read_unlock(); } + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->robust_list; + rcu_read_unlock(); + if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 5f9e689dc8f0..a9642d528630 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -136,40 +137,29 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, { struct compat_robust_list_head __user *head; unsigned long ret; - const struct cred *cred = current_cred(), *pcred; + struct task_struct *p; if (!futex_cmpxchg_enabled) return -ENOSYS; + rcu_read_lock(); + + ret = -ESRCH; if (!pid) - head = current->compat_robust_list; + p = current; else { - struct task_struct *p; - - ret = -ESRCH; - rcu_read_lock(); p = find_task_by_vpid(pid); if (!p) goto err_unlock; - ret = -EPERM; - pcred = __task_cred(p); - /* If victim is in different user_ns, then uids are not - comparable, so we must have CAP_SYS_PTRACE */ - if (cred->user->user_ns != pcred->user->user_ns) { - if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; - goto ok; - } - /* If victim is in same user_ns, then uids are comparable */ - if (cred->euid != pcred->euid && - cred->euid != pcred->uid && - !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) - goto err_unlock; -ok: - head = p->compat_robust_list; - rcu_read_unlock(); } + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); -- cgit v1.2.3 From ec0c4274e33c0373e476b73e01995c53128f1257 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Mar 2012 
12:08:55 -0700 Subject: futex: Mark get_robust_list as deprecated Notify get_robust_list users that the syscall is going away. Suggested-by: Thomas Gleixner Signed-off-by: Kees Cook Cc: Randy Dunlap Cc: Darren Hart Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Eric W. Biederman Cc: David Howells Cc: Serge E. Hallyn Cc: kernel-hardening@lists.openwall.com Cc: spender@grsecurity.net Link: http://lkml.kernel.org/r/20120323190855.GA27213@www.outflux.net Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 ++ kernel/futex_compat.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d701be57c423..e2b0fb9a0b3b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2449,6 +2449,8 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; + WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); + rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a9642d528630..83e368b005fc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -142,6 +142,8 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; + WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); + rcu_read_lock(); ret = -ESRCH; -- cgit v1.2.3 From 107f8bdac992356b3a80d41c9f6ff4399159aa81 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Mar 2012 08:42:34 +0200 Subject: padata: Add a reference to the api documentation Add a reference to the padata api documentation at Documentation/padata.txt Suggested-by: Peter Zijlstra Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 6f10eb285ece..78750882b2ab 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,6 +1,8 @@ /* * padata.c - generic interface to process data streams in parallel * + * See Documentation/padata.txt for an api documentation. + * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert * -- cgit v1.2.3 From 13614e0fb1a8840c134be35c179ff23e23676304 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Mar 2012 08:43:21 +0200 Subject: padata: Use the online cpumask as the default We use the active cpumask to determine the superset of cpus to use for parallelization. However, the active cpumask is for internal usage of the scheduler and therefore not the appropriate cpumask for these purposes. So use the online cpumask instead. 
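As a rule-of-thumb sketch of what this series converges on (illustration only, not the actual padata code; the real masking happens in padata_setup_cpumasks() and padata_validate_cpumask() in the patch below):

	/* Reduce a user-supplied mask to the CPUs that can actually run
	 * work.  cpu_online_mask, not cpu_active_mask: the active mask
	 * is a scheduler-internal notion and is not meant as a general
	 * "usable CPUs" filter. */
	static bool usable_cpumask(struct cpumask *dst,
				   const struct cpumask *user)
	{
		cpumask_and(dst, user, cpu_online_mask);
		return !cpumask_empty(dst);	/* empty means invalid */
	}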
Reported-by: Peter Zijlstra Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 78750882b2ab..de3d0d97800a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -356,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd, if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) return -ENOMEM; - cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { free_cpumask_var(pd->cpumask.cbcpu); return -ENOMEM; } - cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask); return 0; } @@ -566,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier); static bool padata_validate_cpumask(struct padata_instance *pinst, const struct cpumask *cpumask) { - if (!cpumask_intersects(cpumask, cpu_active_mask)) { + if (!cpumask_intersects(cpumask, cpu_online_mask)) { pinst->flags |= PADATA_INVALID; return false; } @@ -680,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; - if (cpumask_test_cpu(cpu, cpu_active_mask)) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) { pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, pinst->cpumask.cbcpu); if (!pd) -- cgit v1.2.3 From 9612090527526a15832480c48b1f4b39e93e8a35 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 28 Mar 2012 08:44:07 +0200 Subject: padata: Fix cpu hotplug We don't remove the cpu that went offline from our cpumasks on cpu hotplug. This got lost somewhere along the line, so restore it. This fixes a hang of the padata instance on cpu hotplug. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index de3d0d97800a..89fe3d1b9efb 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -748,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return -ENOMEM; padata_replace(pinst, pd); + + cpumask_clear_cpu(cpu, pd->cpumask.cbcpu); + cpumask_clear_cpu(cpu, pd->cpumask.pcpu); } return 0; -- cgit v1.2.3 From 092b2fb0766e7a0bf2e50d9cdd7d3b6bb5d12e19 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 29 Mar 2012 14:10:30 -0600 Subject: irqdomain: Remove powerpc dependency from debugfs file The debugfs code is really generic for all platforms. This patch removes the powerpc-specific directory reference and makes it available to all architectures. Signed-off-by: Grant Likely --- kernel/irq/Kconfig | 10 ++++++++++ kernel/irq/irqdomain.c | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..d8e323d12496 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -56,6 +56,16 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +config IRQ_DOMAIN_DEBUG + bool "Expose hardware/virtual IRQ mapping via debugfs" + depends on IRQ_DOMAIN && DEBUG_FS + help + This option will show the mapping relationship between hardware irq + numbers and Linux irq numbers. The mapping is exposed via debugfs + in the file "virq_mapping". + + If you don't know what this means you don't need it. 
+ # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index af48e59bc2ff..3601f3fbf67c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -632,7 +632,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain, return revmap[hwirq]; } -#ifdef CONFIG_VIRQ_DEBUG +#ifdef CONFIG_IRQ_DOMAIN_DEBUG static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; @@ -668,7 +668,7 @@ static int virq_debug_show(struct seq_file *m, void *private) data = irq_desc_get_chip_data(desc); seq_printf(m, "0x%16p ", data); - if (desc->irq_data.domain->of_node) + if (desc->irq_data.domain && desc->irq_data.domain->of_node) p = desc->irq_data.domain->of_node->full_name; else p = none; @@ -695,14 +695,14 @@ static const struct file_operations virq_debug_fops = { static int __init irq_debugfs_init(void) { - if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, NULL, &virq_debug_fops) == NULL) return -ENOMEM; return 0; } __initcall(irq_debugfs_init); -#endif /* CONFIG_VIRQ_DEBUG */ +#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) -- cgit v1.2.3 From 78724b8ef83fc2bcfbc0a72a7ad8a3ce5ad25e6a Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 29 Mar 2012 06:17:17 -0500 Subject: kdb: Fix smatch warning on dbg_io_ops->is_console The Smatch tool warned that the change from commit b8adde8dd (kdb: Avoid using dbg_io_ops until it is initialized) should add another null check later in kdb_printf(). It is worth noting that the second use of dbg_io_ops->is_console is protected by the KDB_PAGER state variable which would only get set when kdb is fully active and initialized. If we ever encounter changes or defects in the KDB_PAGER state we do not want to crash the kernel in a kdb_printf/printk. CC: Tim Bird Reported-by: Dan Carpenter Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b5f17da1c56..bb9520f0f6ff 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -743,7 +743,7 @@ kdb_printit: kdb_input_flush(); c = console_drivers; - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(moreprompt); cp = moreprompt; while (len--) { -- cgit v1.2.3 From 98b54aa1a2241b59372468bd1e9c2d207bdba54b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Mar 2012 10:17:03 -0500 Subject: kgdb,debug_core: pass the breakpoint struct instead of address and memory There is extra state information that needs to be exposed in the kgdb_bpt structure for tracking how a breakpoint was installed. The debug_core only uses the probe_kernel_write() to install breakpoints, but this is not enough for all the archs. Some archs such as x86 need to use text_poke() in order to install a breakpoint into a read only page. Passing the kgdb_bpt structure to kgdb_arch_set_breakpoint() and kgdb_arch_remove_breakpoint() allows other archs to set the type variable which indicates how the breakpoint was installed.
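A hypothetical arch-side sketch of why the struct matters (loosely modeled on what x86 does with text_poke(); the exact enum value an arch records is that arch's business and not part of this patch):

	/* With the whole kgdb_bkpt visible, the arch can remember *how*
	 * the breakpoint was planted and undo it the same way later. */
	int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
	{
		int err;

		err = probe_kernel_read(bpt->saved_instr,
					(char *)bpt->bpt_addr,
					BREAK_INSTR_SIZE);
		if (err)
			return err;

		err = probe_kernel_write((char *)bpt->bpt_addr,
					 arch_kgdb_ops.gdb_bpt_instr,
					 BREAK_INSTR_SIZE);
		if (!err)
			bpt->type = BP_BREAKPOINT;	/* an arch could record
							 * a poke-style install
							 * here instead */
		return err;
	}

The old interface, which passed only an address and a scratch buffer, had nowhere to store that per-breakpoint state.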
Cc: stable@vger.kernel.org # >= 2.6.36 Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 53 +++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3f88a45e6f0a..a7e52ca94563 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -161,37 +161,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); if (err) return err; - - return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + return err; } -int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - return probe_kernel_write((char *)addr, - (char *)bundle, BREAK_INSTR_SIZE); + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } int __weak kgdb_validate_break_address(unsigned long addr) { - char tmp_variable[BREAK_INSTR_SIZE]; + struct kgdb_bkpt tmp; int err; - /* Validate setting the breakpoint and then removing it. In the + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we * found them. 
*/ - err = kgdb_arch_set_breakpoint(addr, tmp_variable); + tmp.bpt_addr = addr; + err = kgdb_arch_set_breakpoint(&tmp); if (err) return err; - err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + err = kgdb_arch_remove_breakpoint(&tmp); if (err) printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " "memory destroyed at: %lx", addr); @@ -235,7 +237,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) */ int dbg_activate_sw_breakpoints(void) { - unsigned long addr; int error; int ret = 0; int i; @@ -244,16 +245,15 @@ int dbg_activate_sw_breakpoints(void) if (kgdb_break[i].state != BP_SET) continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_set_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_set_breakpoint(&kgdb_break[i]); if (error) { ret = error; - printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + printk(KERN_INFO "KGDB: BP install failed: %lx", + kgdb_break[i].bpt_addr); continue; } - kgdb_flush_swbreak_addr(addr); + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); kgdb_break[i].state = BP_ACTIVE; } return ret; @@ -302,7 +302,6 @@ int dbg_set_sw_break(unsigned long addr) int dbg_deactivate_sw_breakpoints(void) { - unsigned long addr; int error; int ret = 0; int i; @@ -310,15 +309,14 @@ int dbg_deactivate_sw_breakpoints(void) for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { if (kgdb_break[i].state != BP_ACTIVE) continue; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) { - printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); ret = error; } - kgdb_flush_swbreak_addr(addr); + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); kgdb_break[i].state = BP_SET; } return ret; @@ -352,7 +350,6 @@ int kgdb_isremovedbreak(unsigned long addr) int dbg_remove_all_break(void) { - unsigned long addr; int error; int i; @@ -360,12 +357,10 @@ int dbg_remove_all_break(void) for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { if (kgdb_break[i].state != BP_ACTIVE) goto setundefined; - addr = kgdb_break[i].bpt_addr; - error = kgdb_arch_remove_breakpoint(addr, - kgdb_break[i].saved_instr); + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); if (error) printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", - addr); + kgdb_break[i].bpt_addr); setundefined: kgdb_break[i].state = BP_UNDEFINED; } -- cgit v1.2.3 From 8f121918f2e49f852de1acdc5255cc1ef440d85b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Mar 2012 22:03:33 -0700 Subject: cgroup: cgroup_attach_task() could return -errno after success 61d1d219c4 "cgroup: remove extra calls to find_existing_css_set" made cgroup_task_migrate() return void. An unfortunate side effect was that cgroup_attach_task() was depending on that function's return value to clear its @retval on the success path. On cgroup mounts without any subsystem with ->can_attach() callback, cgroup_attach_task() ended up returning @retval without initializing it on success. For some reason, gcc failed to warn about it and it didn't cause cgroup_attach_task() to return non-zero value in many cases, probably due to difference in register allocation. When the problem materializes, systemd fails to populate /systemd cgroup mount and fails to boot. Fix it by initializing @retval to zero on declaration. 
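The pattern generalizes beyond cgroups; a standalone reproduction of the hazard (hypothetical names, plain C):

	#include <stdio.h>

	/* 'retval' is only assigned inside a loop that can run zero
	 * times, so without the initializer the success path returns
	 * whatever happened to be on the stack. */
	static int attach_all(int nr_subsys)
	{
		int retval = 0;		/* the fix: initialize here */
		int i;

		for (i = 0; i < nr_subsys; i++) {
			retval = 0;	/* stands in for ss->can_attach() */
			if (retval)
				return retval;
		}
		return retval;		/* garbage if nr_subsys == 0 and
					 * retval were left uninitialized */
	}

	int main(void)
	{
		printf("%d\n", attach_all(0));
		return 0;
	}

That gcc stayed silent is a reminder that uninitialized-variable warnings are heuristic; as the changelog notes, register allocation can mask the symptom entirely.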
Signed-off-by: Tejun Heo Reported-by: Jiri Kosina LKML-Reference: Reviewed-by: Mandeep Singh Baines Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f4ea4b6f3cf1..ed64ccac67c9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1883,7 +1883,7 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval; + int retval = 0; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; -- cgit v1.2.3 From aa2bf9bc6414b6972b9e51903c1ce7b1f057aee2 Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Wed, 21 Mar 2012 20:10:54 +0530 Subject: itimer: Schedule silent NULL pointer fixup in setitimer() for removal setitimer() should return -EFAULT if called with an invalid pointer for value. The current code excludes a NULL pointer from this rule and silently uses it to stop the timer. This violates the spec. Warn about user space apps which rely on that feature and schedule it for removal. [ tglx: Massaged changelog, warn message and Doc entry ] Signed-off-by: Sasikantha babu Link: http://lkml.kernel.org/r/1332340854-26053-1-git-send-email-sasikanth.v19@gmail.com Signed-off-by: Thomas Gleixner --- kernel/itimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 22000c3db0dd..c70369a74b5a 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -284,8 +284,11 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (value) { if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) return -EFAULT; - } else + } else { memset((char *) &set_buffer, 0, sizeof(set_buffer)); + WARN_ONCE(1, "setitimer: new_value pointer is NULL." + " Misfeature support will be removed\n"); + } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); if (error || !ovalue) -- cgit v1.2.3 From f5cb92ac82d06cb583c1f66666314c5c0a4d7913 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 30 Mar 2012 23:11:33 +0800 Subject: genirq: Adjust irq thread affinity on IRQ_SET_MASK_OK_NOCOPY return value irq_move_masked_irq() checks the return code of chip->irq_set_affinity() only for 0, but IRQ_SET_MASK_OK_NOCOPY is also a valid return code, which is there to avoid a redundant copy of the cpumask. But in case of IRQ_SET_MASK_OK_NOCOPY we not only avoid the redundant copy, we also fail to adjust the affinity of a threaded interrupt handler, if there is one. Handle the IRQ_SET_MASK_OK (==0) and IRQ_SET_MASK_OK_NOCOPY (==1) return values correctly by checking the valid return values separately. Signed-off-by: Jiang Liu Cc: Jiang Liu Cc: Keping Chen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1333120296-13563-2-git-send-email-jiang.liu@huawei.com Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 47420908fba0..c3c89751b327 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata) * masking the irqs.
*/ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) - if (!chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false)) { + < nr_cpu_ids)) { + int ret = chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: cpumask_copy(desc->irq_data.affinity, desc->pending_mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); } + } cpumask_clear(desc->pending_mask); } -- cgit v1.2.3 From e3831edd59edf57ca11fc289f08961b20baf5146 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 30 Mar 2012 19:40:28 +0530 Subject: sched: Fix incorrect usage of for_each_cpu_mask() in select_fallback_rq() The function for_each_cpu_mask() expects a *pointer* to struct cpumask as its second argument, whereas select_fallback_rq() passes the value itself. And moreover, for_each_cpu_mask() has been marked as obsolete in include/linux/cpumask.h. So move to the more appropriate for_each_cpu() variant. Reported-by: Sasha Levin Signed-off-by: Srivatsa S. Bhat Acked-by: Peter Zijlstra Cc: Dave Jones Cc: Liu Chuansheng Cc: vapier@gentoo.org Cc: rusty@rustcorp.com.au Link: http://lkml.kernel.org/r/4F75BED4.9050005@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 985f6e595154..8773176b8c77 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1268,7 +1268,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) int dest_cpu; /* Look for allowed, online CPU in same node. */ - for_each_cpu_mask(dest_cpu, *nodemask) { + for_each_cpu(dest_cpu, nodemask) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) @@ -1279,7 +1279,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ - for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) { + for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) -- cgit v1.2.3 From 3872c48b14259d8c0a00c9fff06a4a4123f7f4eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 31 Mar 2012 12:45:43 +0200 Subject: tick: Document TICK_ONESHOT config option This option has been selected from arch code as it was assumed that it's necessary to support oneshot mode clockevent devices. But it's just a core internal helper to compile tick-oneshot.c if NOHZ or HIGH_RES_TIMERS are selected. Reported-by: Russell King Signed-off-by: Thomas Gleixner --- kernel/time/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2cf9cc7aa103..a20dc8a3c949 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,6 +1,10 @@ # # Timer subsystem related configuration options # + +# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. config TICK_ONESHOT bool -- cgit v1.2.3 From 83e3fa6f0193299f8b7180db588edd5ca61a3b82 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 1 Apr 2012 16:38:37 -0400 Subject: irq_work: fix compile failure on MIPS from system.h split Builds of the MIPS platform ip32_defconfig fail as of commit 0195c00244dc ("Merge tag 'split-asm_system_h ...") because the MIPS xchg() macro uses BUILD_BUG_ON and it was moved in commit b81947c646bf ("Disintegrate asm/system.h for MIPS").
The root cause is that the system.h split wasn't tested on a baseline with commit 6c03438edeb5 ("kernel.h: doesn't explicitly use bug.h, so don't include it.") Since this file uses BUG code in several other places besides the xchg call, simply make the inclusion explicit. Signed-off-by: Paul Gortmaker Acked-by: David Howells Signed-off-by: Linus Torvalds --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c3c46c72046e..0c56d44b9fd5 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -5,6 +5,7 @@ * context. The enqueueing is NMI-safe. */ +#include #include #include #include -- cgit v1.2.3 From 620f6e8e855d6d447688a5f67a4e176944a084e8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Apr 2012 11:40:19 -0700 Subject: sysctl: fix write access to dmesg_restrict/kptr_restrict Commit bfdc0b4 adds code to restrict access to dmesg_restrict, however, it incorrectly alters kptr_restrict rather than dmesg_restrict. The original patch from Richard Weinberger (https://lkml.org/lkml/2011/3/14/362) alters dmesg_restrict as expected, and so the patch seems to have been misapplied. This adds the CAP_SYS_ADMIN check to both dmesg_restrict and kptr_restrict, since both are sensitive. Reported-by: Phillip Lougher Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: Richard Weinberger Cc: stable@vger.kernel.org Signed-off-by: James Morris --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 52b3a06a02f8..4ab11879aeb4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write, #endif #ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -703,7 +703,7 @@ static struct ctl_table kern_table[] = { .data = &dmesg_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = &zero, .extra2 = &one, }, @@ -712,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dmesg_restrict, + .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = &zero, .extra2 = &two, }, @@ -1943,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write, } #ifdef CONFIG_PRINTK -static int proc_dmesg_restrict(struct ctl_table *table, int write, +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) -- cgit v1.2.3 From 234e340582901211f40d8c732afc49f0630ecf05 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 5 Apr 2012 14:25:11 -0700 Subject: simple_open: automatically convert to simple_open() Many users of debugfs copy the implementation of default_open() when they want to support a custom read/write function op. This leads to a proliferation of the default_open() implementation across the entire tree. Now that the common implementation has been consolidated into libfs we can replace all the users of this function with simple_open(). 
This replacement was done with the following semantic patch: @ open @ identifier open_f != simple_open; identifier i, f; @@ -int open_f(struct inode *i, struct file *f) -{ ( -if (i->i_private) -f->private_data = i->i_private; | -f->private_data = i->i_private; ) -return 0; -} @ has_open depends on open @ identifier fops; identifier open.open_f; @@ struct file_operations fops = { ... -.open = open_f, +.open = simple_open, ... }; [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stephen Boyd Cc: Greg Kroah-Hartman Cc: Al Viro Cc: Julia Lawall Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/blktrace.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cdea7b56b0c9..c0bd0308741c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_trace_remove); -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { @@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, static const struct file_operations blk_dropped_fops = { .owner = THIS_MODULE, - .open = blk_dropped_open, + .open = simple_open, .read = blk_dropped_read, .llseek = default_llseek, }; -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, size_t count, loff_t *ppos) { @@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, static const struct file_operations blk_msg_fops = { .owner = THIS_MODULE, - .open = blk_msg_open, + .open = simple_open, .write = blk_msg_write, .llseek = noop_llseek, }; -- cgit v1.2.3 From 6f103929f8979d2638e58d7f7fda0beefcb8ee7e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 27 Mar 2012 15:09:37 -0400 Subject: nohz: Fix stale jiffies update in tick_nohz_restart() Fix tick_nohz_restart() to not use a stale ktime_t "now" value when calling tick_do_update_jiffies64(now). If we reach this point in the loop it means that we crossed a tick boundary since we grabbed the "now" timestamp, so at this point "now" refers to a time in the old jiffy, so using the old value for "now" is incorrect, and is likely to give us a stale jiffies value. In particular, the first time through the loop the tick_do_update_jiffies64(now) call is always a no-op, since the caller, tick_nohz_restart_sched_tick(), will have already called tick_do_update_jiffies64(now) with that "now" value. Note that tick_nohz_stop_sched_tick() already uses the correct approach: when we notice we cross a jiffy boundary, grab a new timestamp with ktime_get(), and *then* update jiffies. 
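Condensed to the pattern (a sketch only, with the hrtimer plumbing elided; try_program_tick() is a hypothetical stand-in for the hrtimer_start_expires()/tick_program_event() calls in the real loop):

	for (;;) {
		if (try_program_tick(ts))
			break;			/* tick armed, done */
		/* We crossed a tick boundary after 'now' was taken, so
		 * the old timestamp is stale: reread first ... */
		now = ktime_get();
		/* ... and only then fold it into jiffies. */
		tick_do_update_jiffies64(now);
	}

Updating jiffies from the pre-boundary timestamp, as the old code did, could leave jiffies one tick behind until the next update.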
Signed-off-by: Neal Cardwell Cc: Ben Segall Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1332875377-23014-1-git-send-email-ncardwell@google.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3526038f2836..6a3a5b9ff561 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_get_expires(&ts->sched_timer), 0)) break; } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); + /* Reread time and update jiffies */ now = ktime_get(); + tick_do_update_jiffies64(now); } } -- cgit v1.2.3 From 9886f444129171569461d8c39983e16f4871e3b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Apr 2012 10:50:55 +0200 Subject: itimer: Use printk_once instead of WARN_ONCE David pointed out that using WARN_ONCE() to report usage of a deprecated misfeature makes folks unhappy. Use printk_once() instead. Andrew told me to stop grumbling and to remove the silly typecast while touching the file. Reported-by: David Rientjes Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/itimer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index c70369a74b5a..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -285,9 +285,10 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) return -EFAULT; } else { - memset((char *) &set_buffer, 0, sizeof(set_buffer)); - WARN_ONCE(1, "setitimer: new_value pointer is NULL." - " Misfeature support will be removed\n"); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); -- cgit v1.2.3 From fa4da365bc7772c2cd6d5405bdf151612455f957 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Apr 2012 15:41:44 -0700 Subject: clockevents: Track broadcast device mode change in tick_broadcast_switch_to_oneshot() In commit 77b0d60c5adf39c74039e2142a1d3cd1e4d53799, "clockevents: Leave the broadcast device in shutdown mode when not needed", we were bailing out too quickly in tick_broadcast_switch_to_oneshot(), without tracking the broadcast device mode change to 'TICKDEV_MODE_ONESHOT'. This breaks the platforms which need broadcast device oneshot services during deep idle states. tick_broadcast_oneshot_control() thinks that it is in periodic mode and fails to make proper decisions based on the CLOCK_EVT_NOTIFY_BROADCAST_[ENTER, EXIT] notifications during deep idle entry/exit. Fix this by tracking the broadcast device mode as 'TICKDEV_MODE_ONESHOT', before leaving the broadcast HW device in shutdown mode if there are no active requests for the moment.
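The fixed ordering, condensed (a paraphrase of the patched function shown in the diff below, not new behavior):

	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

	/* Record the mode switch unconditionally ... */
	tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;

	/* ... and only skip programming the hardware when no CPU
	 * currently needs broadcast.  Later BROADCAST_ENTER/EXIT
	 * notifications then see the correct mode even though the
	 * device sat idle at this point. */
	if (!cpumask_empty(tick_get_broadcast_mask()) &&
	    tick_broadcast_device.evtdev)
		tick_broadcast_setup_oneshot(tick_broadcast_device.evtdev);

	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);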
Reported-and-tested-by: Santosh Shilimkar Signed-off-by: Suresh Siddha Cc: johnstul@us.ibm.com Link: http://lkml.kernel.org/r/1334011304.12400.81.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e883f57a3cd3..bf57abdc7bd0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,10 +575,12 @@ void tick_broadcast_switch_to_oneshot(void) unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + if (cpumask_empty(tick_get_broadcast_mask())) goto end; - tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); -- cgit v1.2.3 From 5b7526e3a640e491075557acaa842c59c652c0c3 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 5 Apr 2012 16:52:13 -0700 Subject: irq/irq_domain: Quit ignoring error returns from irq_alloc_desc_from(). In commit 4bbdd45a (irq_domain/powerpc: eliminate irq_map; use irq_alloc_desc() instead) code was added that ignores error returns from irq_alloc_desc_from() by (silently) casting the return value to unsigned. The negative error return value now suddenly looks like a valid irq number. Commits cc79ca69 (irq_domain: Move irq_domain code from powerpc to kernel/irq) and 1bc04f2c (irq_domain: Add support for base irq and hwirq in legacy mappings) move this code to its current location in irqdomain.c The result of all of this is a null pointer dereference OOPS if one of the error cases is hit. The fix: don't cast away the sign of the return value, and check it for errors. Signed-off-by: David Daney Acked-by: Rob Herring [grant.likely: dropped addition of new 'irq' variable] Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3601f3fbf67c..9310a8d365b0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -350,7 +350,8 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int virq, hint; + unsigned int hint; + int virq; pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -381,9 +382,9 @@ unsigned int irq_create_mapping(struct irq_domain *domain, if (hint == 0) hint++; virq = irq_alloc_desc_from(hint, 0); - if (!virq) + if (virq <= 0) virq = irq_alloc_desc_from(1, 0); - if (!virq) { + if (virq <= 0) { pr_debug("irq: -> virq allocation failed\n"); return 0; } -- cgit v1.2.3 From ac5830a33f5b25eae1dc0708b3e7a3d270a6c07f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 10 Apr 2012 15:25:42 +0300 Subject: irq_domain: correct the debugfs file name The actual name of the irq_domain mapping debugfs file is "irq_domain_mapping" not "virq_mapping". Signed-off-by: Mika Westerberg Signed-off-by: Grant Likely --- kernel/irq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index cf1a4a68ce44..d1a758bc972a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -62,7 +62,7 @@ config IRQ_DOMAIN_DEBUG help This option will show the mapping relationship between hardware irq numbers and Linux irq numbers.
The mapping is exposed via debugfs - in the file "virq_mapping". + in the file "irq_domain_mapping". If you don't know what this means you don't need it. -- cgit v1.2.3 From 15e06bf64f686befd2030da867a3dad965b96cc0 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 11 Apr 2012 00:26:25 -0600 Subject: irqdomain: Fix debugfs formatting This patch fixes the irq_domain_mapping debugfs output to pad pointer values with leading zeros so that pointer values are displayed correctly. Otherwise you get output similar to "0x 5e0000000000000". Also, when the irq_domain is set to 'null' Signed-off-by: Grant Likely Cc: David Daney Cc: Mika Westerberg --- kernel/irq/irqdomain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9310a8d365b0..eb05e40f4553 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -643,8 +643,8 @@ static int virq_debug_show(struct seq_file *m, void *private) void *data; int i; - seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "domain name"); + seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", + "chip name", 2 * sizeof(void *) + 2, "chip data", "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); @@ -667,7 +667,7 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "%-15s ", p); data = irq_desc_get_chip_data(desc); - seq_printf(m, "0x%16p ", data); + seq_printf(m, data ? "0x%p " : " %p ", data); if (desc->irq_data.domain && desc->irq_data.domain->of_node) p = desc->irq_data.domain->of_node->full_name; -- cgit v1.2.3 From 79549c6dfda0603dba9a70a53467ce62d9335c33 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 Apr 2012 21:03:50 +0200 Subject: cred: copy_process() should clear child->replacement_session_keyring keyctl_session_to_parent(task) sets ->replacement_session_keyring; it should be processed and cleared by key_replace_session_keyring(). However, this task can fork before it notices TIF_NOTIFY_RESUME and the new child gets the bogus ->replacement_session_keyring copied by dup_task_struct(). This is obviously wrong and, if nothing else, this leads to put_cred(already_freed_cred). Change copy_creds() to clear this member. If copy_process() fails before this point the wrong ->replacement_session_keyring doesn't matter; exit_creds() won't be called. Cc: Signed-off-by: Oleg Nesterov Acked-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 97b36eeca4c9..e70683d9ec32 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -386,6 +386,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; + p->replacement_session_keyring = NULL; + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && -- cgit v1.2.3 From 6fa6c8e25e95bdc73e92e4c96b8e3299169b616e Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 15 Feb 2012 15:06:08 -0700 Subject: irq_domain: Move irq_virq_count into NOMAP revmap This patch replaces the old global setting of irq_virq_count that is only used by the NOMAP mapping and instead uses a revmap_data property so that the maximum NOMAP allocation can be set per NOMAP irq_domain. There is exactly one user of irq_virq_count in-tree right now: PS3. Also, irq_virq_count is only useful for the NOMAP mapping.
So, instead of having a single global irq_virq_count value, this change drops it entirely and adds a max_irq argument to irq_domain_add_nomap(). That makes it a property of an individual nomap irq domain instead of a global system setting. Signed-off-by: Grant Likely Tested-by: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller --- kernel/irq/irqdomain.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb05e40f4553..d34413e78628 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,7 +23,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); -static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; /** @@ -184,13 +183,16 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + unsigned int max_irq, const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_NOMAP, ops, host_data); - if (domain) + if (domain) { + domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; irq_domain_add(domain); + } return domain; } @@ -262,22 +264,6 @@ void irq_set_default_host(struct irq_domain *domain) irq_default_domain = domain; } -/** - * irq_set_virq_count() - Set the maximum number of linux irqs - * @count: number of linux irqs, capped with NR_IRQS - * - * This is mainly for use by platforms like iSeries who want to program - * the virtual irq number in the controller to avoid the reverse mapping - */ -void irq_set_virq_count(unsigned int count) -{ - pr_debug("irq: Trying to set virq count to %d\n", count); - - BUG_ON(count < NUM_ISA_INTERRUPTS); - if (count < NR_IRQS) - irq_virq_count = count; -} - static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { @@ -320,13 +306,12 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) pr_debug("irq: create_direct virq allocation failed\n"); return 0; } - if (virq >= irq_virq_count) { + if (virq >= domain->revmap_data.nomap.max_irq) { pr_err("ERROR: no free irqs available below %i maximum\n", - irq_virq_count); + domain->revmap_data.nomap.max_irq); irq_free_desc(virq); return 0; } - pr_debug("irq: create_direct obtained virq %d\n", virq); if (irq_setup_virq(domain, virq, virq)) { @@ -378,7 +363,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return irq_domain_legacy_revmap(domain, hwirq); /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; + hint = hwirq % nr_irqs; if (hint == 0) hint++; virq = irq_alloc_desc_from(hint, 0); @@ -516,7 +501,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; - unsigned int hint = hwirq % irq_virq_count; + unsigned int hint = hwirq % nr_irqs; /* Look for default domain if nececssary */ if (domain == NULL) @@ -537,7 +522,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; - if (i >= irq_virq_count) + if (i >= nr_irqs) i = 1; } while(i != hint); return 0; -- cgit v1.2.3 From 026ee1f66aaa7f01b617a0ba89ac4b531f9603f1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 12 Apr 2012 12:49:17 -0700 Subject: panic: fix stack dump print on direct call to panic() Commit 6e6f0a1f0fa6 ("panic: don't print
redundant backtraces on oops") causes a regression where no stack trace will be printed at all for the case where kernel code calls panic() directly while not processing an oops, and of course there are hundreds of instances of this type of call. The original commit executed the check (!oops_in_progress), but this will always be false because just before the dump_stack() there is a call to bust_spinlocks(1), which does the following: void __attribute__((weak)) bust_spinlocks(int yes) { if (yes) { ++oops_in_progress; The proper way to resolve the problem that the original commit tried to solve is to avoid printing a stack dump from panic() when either of the following conditions is true: 1) TAINT_DIE has been set (this is done by oops_end()) This indicates an oops has already been printed. 2) oops_in_progress > 1 This guards against the rare case where panic() is invoked a second time, or in between oops_begin() and oops_end() Signed-off-by: Jason Wessel Cc: Andi Kleen Cc: [3.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 80aed44e345a..8ed89a175d79 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -97,7 +97,7 @@ void panic(const char *fmt, ...) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!oops_in_progress) + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) dump_stack(); #endif -- cgit v1.2.3 From 5269a9ab7def9a3116663347d59c4d70afa2d180 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 12 Apr 2012 14:42:15 -0600 Subject: irq_domain: fix type mismatch in debugfs output format sizeof(void*) returns an unsigned long, but it was being used as a width parameter to a "%-*s" format string which requires an int. On 64 bit platforms this causes a type mismatch: linux/kernel/irq/irqdomain.c:575: warning: field width should have type 'int', but argument 6 has type 'long unsigned int' This change casts the size to an int so printf gets the right data type. Reported-by: Andreas Schwab Signed-off-by: Grant Likely Cc: David Daney --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d34413e78628..0e0ba5f840b2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -629,7 +629,8 @@ static int virq_debug_show(struct seq_file *m, void *private) int i; seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", - "chip name", 2 * sizeof(void *) + 2, "chip data", "domain name"); + "chip name", (int)(2 * sizeof(void *) + 2), "chip data", + "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); -- cgit v1.2.3 From ef1f0982540e5f79c8bbf3675bbc0a9734dba3fc Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 11 Apr 2012 12:21:39 -0400 Subject: irq_work: fix compile failure on tile from missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with IRQ_WORK configured results in kernel/irq_work.c: In function ‘irq_work_run’: kernel/irq_work.c:110: error: implicit declaration of function ‘irqs_disabled’ The appropriate header just needs to be included.
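The general lesson is the include-what-you-use rule; a minimal hypothetical translation unit (irqs_disabled() is declared in <linux/irqflags.h> in this era, so any file calling it should pull that header in directly rather than rely on a nested include):

	#include <linux/bug.h>
	#include <linux/irqflags.h>

	/* Compiles on its own because every symbol it uses comes from
	 * a header it includes itself. */
	static inline void warn_if_irqs_off(void)
	{
		WARN_ON_ONCE(irqs_disabled());
	}

Both this failure and the MIPS one above share a root cause: headers that used to arrive implicitly via kernel.h or system.h no longer do after the header cleanups.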
Signed-off-by: Chris Metcalf Signed-off-by: Paul Gortmaker --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 0c56d44b9fd5..1588e3b2871b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -11,6 +11,7 @@ #include #include #include +#include #include /* -- cgit v1.2.3