From bf262dcec6383188a3324192c4a7e405b3b1ad23 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 12 Apr 2016 05:02:09 +0930 Subject: module: fix noreturn attribute for __module_put_and_exit() __module_put_and_exit() is marked noreturn in module.h declaration, but is lacking the attribute in the definition, which makes some tools (such as sparse) unhappy. Amend the definition with the attribute as well (and reformat the declaration so that it uses more common format). Signed-off-by: Jiri Kosina Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5f71aa63ed2a..5e876977844b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -336,7 +336,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_exit(struct module *mod, long code) { module_put(mod); do_exit(code); -- cgit v1.2.3 From c75b590d60ffa3e31bcb9608b68006a8bab9e0ed Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Apr 2016 05:03:09 +0930 Subject: module: fix redundant test. [linux-4.5-rc4/kernel/module.c:1692]: (style) Redundant condition: attr.test. '!attr.test || (attr.test && attr.test(mod))' is equivalent to '!attr.test || attr.test(mod)' This code was added like this ten years ago, in c988d2b284549 "modules: add version and srcversion to sysfs". 
Reported-by: David Binderman Cc: Matt Domsch Signed-off-by: Rusty Russell --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5e876977844b..9e04a4210a4a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1693,8 +1693,7 @@ static int module_add_modinfo_attrs(struct module *mod) temp_attr = mod->modinfo_attrs; for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { - if (!attr->test || - (attr->test && attr->test(mod))) { + if (!attr->test || attr->test(mod)) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj, -- cgit v1.2.3 From 3205c36cf7d96024626f92d65f560035df1abcb2 Mon Sep 17 00:00:00 2001 From: Libor Pechacek Date: Wed, 13 Apr 2016 11:06:12 +0930 Subject: module: Issue warnings when tainting kernel While most of the locations where a kernel taint bit is set are accompanied with a warning message, there are two which set their bits silently. If the tainting module gets unloaded later on, it is almost impossible to tell what was the reason for setting the flag. 
Signed-off-by: Libor Pechacek Signed-off-by: Rusty Russell --- kernel/module.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9e04a4210a4a..0b4f3a85d4fc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2919,8 +2919,12 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); @@ -3089,6 +3093,8 @@ static int move_module(struct module *mod, struct load_info *info) static int check_module_license_and_versions(struct module *mod) { + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -3107,6 +3113,9 @@ static int check_module_license_and_versions(struct module *mod) add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) -- cgit v1.2.3 From bca014caaa6130e57f69b5bf527967aa8ee70fdd Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 28 Apr 2016 09:24:01 +0930 Subject: module: Invalidate signatures on force-loaded modules Signing a module should only make it trusted by the specific kernel it was built for, not anything else. Loading a signed module meant for a kernel with a different ABI could have interesting effects. Therefore, treat all signatures as invalid when a module is force-loaded. 
Signed-off-by: Ben Hutchings Cc: stable@vger.kernel.org Signed-off-by: Rusty Russell --- kernel/module.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b4f3a85d4fc..7f21ab238aa7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2686,13 +2686,18 @@ static inline void kmemleak_load_module(const struct module *mod, #endif #ifdef CONFIG_MODULE_SIG -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { int err = -ENOKEY; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const void *mod = info->hdr; - if (info->len > markerlen && + /* + * Require flags == 0, as a module with version information + * removed is no longer the module that was signed + */ + if (flags == 0 && + info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; @@ -2711,7 +2716,7 @@ static int module_sig_check(struct load_info *info) return err; } #else /* !CONFIG_MODULE_SIG */ -static int module_sig_check(struct load_info *info) +static int module_sig_check(struct load_info *info, int flags) { return 0; } @@ -3506,7 +3511,7 @@ static int load_module(struct load_info *info, const char __user *uargs, long err; char *after_dashes; - err = module_sig_check(info); + err = module_sig_check(info, flags); if (err) goto free_copy; -- cgit v1.2.3 From 784bdf3bb694b256fcd6120b93e8947a84249a3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Jul 2016 16:32:30 +0200 Subject: futex: Assume all mappings are private on !MMU systems To quote Rick why there is no need for shared mapping on !MMU systems: |With MMU, shared futex keys need to identify the physical backing for |a memory address because it may be mapped at different addresses in |different processes (or even multiple times in the same process). 
|Without MMU this cannot happen. You only have physical addresses. So |the "private futex" behavior of using the virtual address as the key |is always correct (for both shared and private cases) on nommu |systems. This patch disables the FLAGS_SHARED in a way that allows the compiler to remove that code. [bigeasy: Added changelog ] Reported-by: Rich Felker Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160729143230.GA21715@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 33664f70e2d2..46cb3a301bc1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled; * Futex flags used to encode options to functions and preserve them across * restarts. */ -#define FLAGS_SHARED 0x01 +#ifdef CONFIG_MMU +# define FLAGS_SHARED 0x01 +#else +/* + * NOMMU does not have per process address space. Let the compiler optimize + * code away. + */ +# define FLAGS_SHARED 0x00 +#endif #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 @@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key) if (!key->both.ptr) return; + /* + * On MMU less systems futexes are always "private" as there is no per + * process address space. We need the smp wmb nevertheless - yes, + * arch/blackfin has MMU less SMP ... 
+ */ + if (!IS_ENABLED(CONFIG_MMU)) { + smp_mb(); /* explicit smp_mb(); (B) */ + return; + } + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: ihold(key->shared.inode); /* implies smp_mb(); (B) */ @@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key) return; } + if (!IS_ENABLED(CONFIG_MMU)) + return; + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: iput(key->shared.inode); -- cgit v1.2.3 From e3f91083facb792dc8d8fd0a59639e4d6e7c0c8f Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Sat, 23 Jul 2016 14:42:37 +0530 Subject: jump_label: Make it possible for arches to invoke jump_label_init() earlier Some arches (powerpc at least) would like to invoke jump_label_init() much earlier in boot. So check static_key_initialized in order to make sure this function runs only once. LGTM-by: Ingo (http://marc.info/?l=linux-kernel&m=144049104329961&w=2) Signed-off-by: Kevin Hao Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- kernel/jump_label.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..2d693be967df 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -235,6 +235,9 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + if (static_key_initialized) + return; + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); -- cgit v1.2.3 From 0d87d7ec22a0879d3926faa4f4f4412a5dee1fba Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 1 Aug 2016 13:49:29 -0700 Subject: perf/core: Change log level for duration warning to KERN_INFO When the perf interrupt handler exceeds a threshold warning messages are displayed on console: [12739.31793] perf interrupt took too long (2504 > 2500), lowering kernel.perf_event_max_sample_rate to 50000 [71340.165065] perf interrupt took too long (5005 > 5000), lowering kernel.perf_event_max_sample_rate 
to 25000 Many customers and users are confused by the message wondering if something is wrong or they need to take action to fix a problem. Since a user can not do anything to fix the issue, the message is really more informational than a warning. Adjust the log level accordingly. Signed-off-by: David Ahern Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470084569-438-1-git-send-email-dsa@cumulusnetworks.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 356a6c7cb52a..a19550d80ab1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -448,7 +448,7 @@ static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { - printk_ratelimited(KERN_WARNING + printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, -- cgit v1.2.3 From 9502514f2808d29f6f2afa1c410e7808898dede1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 19 Jul 2016 05:59:24 +0930 Subject: module: Do a WARN_ON_ONCE() for assert module mutex not held When running with lockdep enabled, I triggered the WARN_ON() in the module code that asserts when module_mutex or rcu_read_lock_sched are not held. The issue I have is that this can also be called from the dump_stack() code, causing us to enter an infinite loop... 
------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8fa70 ffff880215e8fa70 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8fac0 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8f7a0 ffff880215e8f7a0 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8f7f0 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc3-test-00013-g501c2375253c #14 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 ffff880215e8f4d0 ffff880215e8f4d0 ffffffff812fc8e3 0000000000000000 ffffffff81d3e55b ffff880215e8f520 ffffffff8104fc88 ffffffff8104fcab 0000000915e88300 0000000000000046 ffffffffa019b29a 0000000000000001 Call Trace: [] dump_stack+0x67/0x90 [] __warn+0xcb/0xe9 [] ? 
warn_slowpath_null+0x5/0x1f ------------[ cut here ]------------ WARNING: CPU: 1 PID: 0 at kernel/module.c:268 module_assert_mutex_or_preempt+0x3c/0x3e [...] Which gives us rather useless information. Worse yet, there's some race that causes this, and I seldom trigger it, so I have no idea what happened. This would not be an issue if that warning was a WARN_ON_ONCE(). Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7f21ab238aa7..beaebea627ff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -264,7 +264,7 @@ static void module_assert_mutex_or_preempt(void) if (unlikely(!debug_locks)) return; - WARN_ON(!rcu_read_lock_sched_held() && + WARN_ON_ONCE(!rcu_read_lock_sched_held() && !lockdep_is_held(&module_mutex)); #endif } -- cgit v1.2.3 From be7de5f91fdc3a63ee01910c43f20db213445ce4 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 21 Jul 2016 15:37:56 +0930 Subject: modules: Add kernel parameter to blacklist modules Blacklisting a module in linux has long been a problem. The current procedure is to use rd.blacklist=module_name, however, that doesn't cover the case after the initramfs and before a boot prompt (where one is supposed to use /etc/modprobe.d/blacklist.conf to blacklist runtime loading). Using rd.shell to get an early prompt is hit-or-miss, and doesn't cover all situations AFAICT. This patch adds this functionality of permanently blacklisting a module by its name via the kernel parameter module_blacklist=module_name. [v2]: Rusty, use core_param() instead of __setup() which simplifies things. 
[v3]: Rusty, undo wreckage from strsep() [v4]: Rusty, simpler version of blacklisted() Signed-off-by: Prarit Bhargava Cc: Jonathan Corbet Cc: Rusty Russell Cc: linux-doc@vger.kernel.org Signed-off-by: Rusty Russell --- kernel/module.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index beaebea627ff..c91c2fdca2e6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3168,6 +3168,27 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, return 0; } +/* module_blacklist is a comma-separated list of module names */ +static char *module_blacklist; +static bool blacklisted(char *module_name) +{ + const char *p; + size_t len; + + if (!module_blacklist) + return false; + + for (p = module_blacklist; *p; p += len) { + len = strcspn(p, ","); + if (strlen(module_name) == len && !memcmp(module_name, p, len)) + return true; + if (p[len] == ',') + len++; + } + return false; +} +core_param(module_blacklist, module_blacklist, charp, 0400); + static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ @@ -3178,6 +3199,9 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; + if (blacklisted(mod->name)) + return ERR_PTR(-EPERM); + err = check_modinfo(mod, info, flags); if (err) return ERR_PTR(err); -- cgit v1.2.3 From bdc9f373551dd3f1e6fae1618f2394ee9bc7db2e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 27 Jul 2016 12:17:35 +0930 Subject: jump_label: disable preemption around __module_text_address(). Steven reported a warning caused by not holding module_mutex or rcu_read_lock_sched: his backtrace was corrupted but a quick audit found this possible cause. It's wrong anyway... 
Reported-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/jump_label.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..0eef93962a91 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -284,11 +284,14 @@ static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; + preempt_disable(); mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + preempt_enable(); + if (!mod) return 0; - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); return __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, -- cgit v1.2.3 From 444d13ff10fb13bc3e64859c3cf9ce43dcfeb075 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 27 Jul 2016 12:06:21 +0930 Subject: modules: add ro_after_init support Add ro_after_init support for modules by adding a new page-aligned section in the module layout (after rodata) for ro_after_init data and enabling RO protection for that section after module init runs. Signed-off-by: Jessica Yu Acked-by: Kees Cook Signed-off-by: Rusty Russell --- kernel/livepatch/core.c | 2 +- kernel/module.c | 66 ++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5c2bc1052691..8bbe50704621 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -309,7 +309,7 @@ static int klp_write_object_relocations(struct module *pmod, break; } - module_enable_ro(pmod); + module_enable_ro(pmod, true); return ret; } diff --git a/kernel/module.c b/kernel/module.c index c91c2fdca2e6..205a71a97852 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1857,10 +1857,11 @@ static void mod_sysfs_teardown(struct module *mod) * from modification and any data from execution. 
* * General layout of module is: - * [text] [read-only-data] [writable data] - * text_size -----^ ^ ^ - * ro_size ------------------------| | - * size -------------------------------------------| + * [text] [read-only-data] [ro-after-init] [writable data] + * text_size -----^ ^ ^ ^ + * ro_size ------------------------| | | + * ro_after_init_size -----------------------------| | + * size -----------------------------------------------------------| * * These values are always page-aligned (as is base) */ @@ -1883,14 +1884,24 @@ static void frob_rodata(const struct module_layout *layout, (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } +static void frob_ro_after_init(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT); +} + static void frob_writable_data(const struct module_layout *layout, int (*set_memory)(unsigned long start, int num_pages)) { BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); - BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1)); BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); - set_memory((unsigned long)layout->base + layout->ro_size, - (layout->size - layout->ro_size) >> PAGE_SHIFT); + set_memory((unsigned long)layout->base + layout->ro_after_init_size, + (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT); } /* livepatching wants to disable read-only so it can frob module. 
*/ @@ -1898,21 +1909,26 @@ void module_disable_ro(const struct module *mod) { frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); + frob_ro_after_init(&mod->core_layout, set_memory_rw); frob_text(&mod->init_layout, set_memory_rw); frob_rodata(&mod->init_layout, set_memory_rw); } -void module_enable_ro(const struct module *mod) +void module_enable_ro(const struct module *mod, bool after_init) { frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); frob_rodata(&mod->init_layout, set_memory_ro); + + if (after_init) + frob_ro_after_init(&mod->core_layout, set_memory_ro); } static void module_enable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_nx); + frob_ro_after_init(&mod->core_layout, set_memory_nx); frob_writable_data(&mod->core_layout, set_memory_nx); frob_rodata(&mod->init_layout, set_memory_nx); frob_writable_data(&mod->init_layout, set_memory_nx); @@ -1921,6 +1937,7 @@ static void module_enable_nx(const struct module *mod) static void module_disable_nx(const struct module *mod) { frob_rodata(&mod->core_layout, set_memory_x); + frob_ro_after_init(&mod->core_layout, set_memory_x); frob_writable_data(&mod->core_layout, set_memory_x); frob_rodata(&mod->init_layout, set_memory_x); frob_writable_data(&mod->init_layout, set_memory_x); @@ -1963,6 +1980,8 @@ static void disable_ro_nx(const struct module_layout *layout) frob_text(layout, set_memory_rw); frob_rodata(layout, set_memory_rw); frob_rodata(layout, set_memory_x); + frob_ro_after_init(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -2305,6 +2324,7 @@ static void layout_sections(struct module *mod, struct load_info *info) * finder in the two loops below */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, { 
SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, { ARCH_SHF_SMALL | SHF_ALLOC, 0 } }; @@ -2336,7 +2356,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.size = debug_align(mod->core_layout.size); mod->core_layout.ro_size = mod->core_layout.size; break; - case 3: /* whole core */ + case 2: /* RO after init */ + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_after_init_size = mod->core_layout.size; + break; + case 4: /* whole core */ mod->core_layout.size = debug_align(mod->core_layout.size); break; } @@ -2366,7 +2390,14 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->init_layout.size = debug_align(mod->init_layout.size); mod->init_layout.ro_size = mod->init_layout.size; break; - case 3: /* whole init */ + case 2: + /* + * RO after init doesn't apply to init_layout (only + * core_layout), so it just takes the value of ro_size. + */ + mod->init_layout.ro_after_init_size = mod->init_layout.ro_size; + break; + case 4: /* whole init */ mod->init_layout.size = debug_align(mod->init_layout.size); break; } @@ -3193,6 +3224,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) { /* Module within temporary copy. */ struct module *mod; + unsigned int ndx; int err; mod = setup_load_info(info, flags); @@ -3215,6 +3247,15 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* We will do a special allocation for per-cpu sections later. */ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; + /* + * Mark ro_after_init section with SHF_RO_AFTER_INIT so that + * layout_sections() can put it in the right place. + * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. + */ + ndx = find_sec(info, ".data..ro_after_init"); + if (ndx) + info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + /* Determine total sizes, and put offsets in sh_entsize. 
For now this is done generically; there doesn't appear to be any special cases for the architectures. */ @@ -3381,12 +3422,14 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif + module_enable_ro(mod, true); mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; mod->init_layout.ro_size = 0; + mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be @@ -3478,8 +3521,7 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions */ - module_enable_ro(mod); + module_enable_ro(mod, false); module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, -- cgit v1.2.3 From 97f2645f358b411ba2afb22e5966753f0ad92916 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 3 Aug 2016 13:45:50 -0700 Subject: tree-wide: replace config_enabled() with IS_ENABLED() The use of config_enabled() against config options is ambiguous. In practical terms, config_enabled() is equivalent to IS_BUILTIN(), but the author might have used it for the meaning of IS_ENABLED(). Using IS_ENABLED(), IS_BUILTIN(), IS_MODULE() etc. makes the intention clearer. This commit replaces config_enabled() with IS_ENABLED() where possible. This commit is only touching bool config options. 
I noticed two cases where config_enabled() is used against a tristate option: - config_enabled(CONFIG_HWMON) [ drivers/net/wireless/ath/ath10k/thermal.c ] - config_enabled(CONFIG_BACKLIGHT_CLASS_DEVICE) [ drivers/gpu/drm/gma500/opregion.c ] I did not touch them because they should be converted to IS_BUILTIN() in order to keep the logic, but I was not sure it was the authors' intention. Link: http://lkml.kernel.org/r/1465215656-20569-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Kees Cook Cc: Stas Sergeev Cc: Matt Redfearn Cc: Joshua Kinard Cc: Jiri Slaby Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Markos Chandras Cc: "Dmitry V. Levin" Cc: yu-cheng yu Cc: James Hogan Cc: Brian Gerst Cc: Johannes Berg Cc: Peter Zijlstra Cc: Al Viro Cc: Will Drewry Cc: Nikolay Martynov Cc: Huacai Chen Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Daniel Borkmann Cc: Leonid Yegoshin Cc: Rafal Milecki Cc: James Cowgill Cc: Greg Kroah-Hartman Cc: Ralf Baechle Cc: Alex Smith Cc: Adam Buchbinder Cc: Qais Yousef Cc: Jiang Liu Cc: Mikko Rapeli Cc: Paul Gortmaker Cc: Denys Vlasenko Cc: Brian Norris Cc: Hidehiro Kawai Cc: "Luis R. Rodriguez" Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Roland McGrath Cc: Paul Burton Cc: Kalle Valo Cc: Viresh Kumar Cc: Tony Wu Cc: Huaitong Han Cc: Sumit Semwal Cc: Alexei Starovoitov Cc: Juergen Gross Cc: Jason Cooper Cc: "David S. Miller" Cc: Oleg Nesterov Cc: Andrea Gelmini Cc: David Woodhouse Cc: Marc Zyngier Cc: Rabin Vincent Cc: "Maciej W. 
Rozycki" Cc: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 ++-- kernel/seccomp.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d49bfa1e53e6..1d3b7665d0be 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -585,8 +585,8 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) return -EINVAL; if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) { - if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) || - !config_enabled(CONFIG_SECCOMP)) + if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) || + !IS_ENABLED(CONFIG_SECCOMP)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54d15eb2b701..ef6c6c3f9d8a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -347,7 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -542,7 +542,7 @@ void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; @@ -655,7 +655,7 @@ int __secure_computing(const struct seccomp_data *sd) int mode = current->seccomp.mode; int this_syscall; - if (config_enabled(CONFIG_CHECKPOINT_RESTORE) && + if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; -- cgit v1.2.3 From 1f69bf9c6137602cd028c96b4f8329121ec89231 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 3 Aug 2016 13:46:36 -0700 Subject: jump_label: remove bug.h, atomic.h dependencies for HAVE_JUMP_LABEL The current jump_label.h includes bug.h for 
things such as WARN_ON(). This makes the header problematic for inclusion by kernel.h or any headers that kernel.h includes, since bug.h includes kernel.h (circular dependency). The inclusion of atomic.h is similarly problematic. Thus, this should make jump_label.h 'includable' from most places. Link: http://lkml.kernel.org/r/7060ce35ddd0d20b33bf170685e6b0fab816bdf2.1467837322.git.jbaron@akamai.com Signed-off-by: Jason Baron Cc: "David S. Miller" Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Heiko Carstens Cc: Joe Perches Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/jump_label.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0dbea887d625..f19aa02a8f48 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -56,6 +57,49 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); +/* + * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * The use of 'atomic_read()' requires atomic.h and its problematic for some + * kernel headers such as kernel.h and others. Since static_key_count() is not + * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * to have it be a function here. Similarly, for 'static_key_enable()' and + * 'static_key_disable()', which require bug.h. This should allow jump_label.h + * to be included from most/all places for HAVE_JUMP_LABEL. + */ +int static_key_count(struct static_key *key) +{ + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + + return n >= 0 ? 
n : 1; +} +EXPORT_SYMBOL_GPL(static_key_count); + +void static_key_enable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (!count) + static_key_slow_inc(key); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + int count = static_key_count(key); + + WARN_ON_ONCE(count < 0 || count > 1); + + if (count) + static_key_slow_dec(key); +} +EXPORT_SYMBOL_GPL(static_key_disable); + void static_key_slow_inc(struct static_key *key) { int v, v1; @@ -235,6 +279,15 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; + /* + * Since we are initializing the static_key.enabled field with + * with the 'raw' int values (to avoid pulling in atomic.h) in + * jump_label.h, let's make sure that is safe. There are only two + * cases to check since we initialize to 0 or 1. + */ + BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); + BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); + jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); -- cgit v1.2.3 From 1eff9d322a444245c67515edb52bc0eb68374aa8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2016 15:35:16 -0600 Subject: block: rename bio bi_rw to bi_opf Since commit 63a4cc24867d, bio->bi_rw contains flags in the lower portion and the op code in the higher portions. This means that old code that relies on manually setting bi_rw is most likely going to be broken. Instead of letting that brokenness linger, rename the member, to force old and out-of-tree code to break at compile time instead of at runtime. No intended functional changes in this commit.
Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb345cd11883..7598e6ca817a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -776,7 +776,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -881,7 +881,7 @@ static void blk_add_trace_split(void *ignore, __be64 rpdu = cpu_to_be64(pdu); __blk_add_trace(bt, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, bio_op(bio), bio->bi_rw, + bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), &rpdu); } @@ -915,7 +915,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_rw, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, sizeof(r), &r); } -- cgit v1.2.3 From 574673c231a5fad1560249cc3a598907acb36cf9 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Thu, 4 Aug 2016 09:52:09 +0200 Subject: printk: Remove unnecessary #ifdef CONFIG_PRINTK In commit 874f9c7da9a4 ("printk: create pr_ functions"), new pr_level defines were added to printk.c. These new defines are guarded by an #ifdef CONFIG_PRINTK - however, there is already a surrounding #ifdef CONFIG_PRINTK starting a lot earlier in line 249 which means the newly introduced #ifdef is unnecessary. Let's remove it to avoid confusion. 
Signed-off-by: Andreas Ziegler Cc: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a5ef95ca18c9..a37fc8cf8e84 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1930,7 +1930,6 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -#ifdef CONFIG_PRINTK #define define_pr_level(func, loglevel) \ asmlinkage __visible void func(const char *fmt, ...) \ { \ @@ -1949,7 +1948,6 @@ define_pr_level(__pr_err, LOGLEVEL_ERR); define_pr_level(__pr_warn, LOGLEVEL_WARNING); define_pr_level(__pr_notice, LOGLEVEL_NOTICE); define_pr_level(__pr_info, LOGLEVEL_INFO); -#endif int vprintk_default(int level, const char *fmt, va_list args) { -- cgit v1.2.3 From f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Jul 2016 17:18:33 +0100 Subject: genirq/msi: Make sure PCI MSIs are activated early Bharat Kumar Gogada reported issues with the generic MSI code, where the end-point ended up with garbage in its MSI configuration (both for the vector and the message). It turns out that the two MSI paths in the kernel are doing slightly different things: generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI And it turns out that end-points are allowed to latch the content of the MSI configuration registers as soon as MSIs are enabled. In Bharat's case, the end-point ends up using whatever was there already, which is not what you want. In order to make things converge, we introduce a new MSI domain flag (MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set, this flag forces the programming of the end-point as soon as the MSIs are allocated. 
A consequence of this is that we have an extra activate in irq_startup, but that should be without much consequence. tglx: - Several people reported a VMWare regression with PCI/MSI-X passthrough. It turns out that the patch also cures that issue. - We need to have a look at the MSI disable interrupt path, where we write the msg to all zeros without disabling MSI in the PCI device. Is that correct? Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts" Reported-and-tested-by: Bharat Kumar Gogada Reported-and-tested-by: Foster Snowhill Reported-by: Matthias Prager Reported-by: Jason Taylor Signed-off-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 54999350162c..19e9dfbe97fa 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. 
+ */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; -- cgit v1.2.3 From 46c8f0b077a838eb1f6169bb370aab8ed98f7630 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 8 Aug 2016 16:29:07 -0400 Subject: timers: Fix get_next_timer_interrupt() computation The tick_nohz_stop_sched_tick() routine is not properly canceling the sched timer when nothing is pending, because get_next_timer_interrupt() is no longer returning KTIME_MAX in that case. This causes periodic interrupts when none are needed. When determining the next interrupt time, we first use __next_timer_interrupt() to get the first expiring timer in the timer wheel. If no timer is found, we return the base clock value plus NEXT_TIMER_MAX_DELTA to indicate there is no timer in the timer wheel. Back in get_next_timer_interrupt(), we set the "expires" value by converting the timer wheel expiry (in ticks) to a nsec value. But we don't want to do this if the timer wheel expiry value indicates no timer; we want to return KTIME_MAX. Prior to commit 500462a9de65 ("timers: Switch to a non-cascading wheel") we checked base->active_timers to see if any timers were active, and if not, we didn't touch the expiry value and so properly returned KTIME_MAX. Now we don't have active_timers. To fix this, we now just check the timer wheel expiry value to see if it is "now + NEXT_TIMER_MAX_DELTA", and if it is, we don't try to compute a new value based on it, but instead simply let the KTIME_MAX value in expires remain. 
Fixes: 500462a9de65 "timers: Switch to a non-cascading wheel" Signed-off-by: Chris Metcalf Cc: Frederic Weisbecker Cc: Christoph Lameter Cc: John Stultz Link: http://lkml.kernel.org/r/1470688147-22287-1-git-send-email-cmetcalf@mellanox.com Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 555670a5143c..32bf6f75a8fe 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; + bool is_max_delta; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* * We have a fresh next event. Check whether we can forward the base: @@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) expires = basem; base->is_idle = false; } else { - expires = basem + (nextevt - basej) * TICK_NSEC; + if (!is_max_delta) + expires = basem + (nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ -- cgit v1.2.3 From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Aug 2016 10:48:18 -0700 Subject: Revert "printk: create pr_ functions" This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142. Geert Uytterhoeven reports: "This change seems to have an (unintendent?) side-effect. Before, pr_*() calls without a trailing newline characters would be printed with a newline character appended, both on the console and in the output of the dmesg command. 
After this commit, no new line character is appended, and the output of the next pr_*() call of the same type may be appended, like in: - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000 - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM) + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)" Joe Perches says: "No, that is not intentional. The newline handling code inside vprintk_emit is a bit involved and for now I suggest a revert until this has all the same behavior as earlier" Reported-by: Geert Uytterhoeven Requested-by: Joe Perches Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/internal.h | 16 ++++++---------- kernel/printk/nmi.c | 13 ++----------- kernel/printk/printk.c | 25 +++---------------------- 3 files changed, 11 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 5d4505f30083..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -16,11 +16,9 @@ */ #include -typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt, - va_list args); +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); -__printf(2, 0) -int vprintk_default(int level, const char *fmt, va_list args); +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI @@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock; * via per-CPU variable. 
*/ DECLARE_PER_CPU(printk_func_t, printk_func); -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return this_cpu_read(printk_func)(level, fmt, args); + return this_cpu_read(printk_func)(fmt, args); } extern atomic_t nmi_message_lost; @@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void) #else /* CONFIG_PRINTK_NMI */ -__printf(2, 0) -static inline int vprintk_func(int level, const char *fmt, va_list args) +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { - return vprintk_default(level, fmt, args); + return vprintk_default(fmt, args); } static inline int get_nmi_message_lost(void) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bc3eeb1ae6da..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(int level, const char *fmt, va_list args) +static int vprintk_nmi(const char *fmt, va_list args) { struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); int add = 0; @@ -79,16 +79,7 @@ again: if (!len) smp_rmb(); - if (level != LOGLEVEL_DEFAULT) { - add = snprintf(s->buffer + len, sizeof(s->buffer) - len, - KERN_SOH "%c", '0' + level); - add += vsnprintf(s->buffer + len + add, - sizeof(s->buffer) - len - add, - fmt, args); - } else { - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, - fmt, args); - } + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a37fc8cf8e84..eea6dbc2d8cf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); -#define define_pr_level(func, loglevel) \ -asmlinkage __visible void func(const char *fmt, ...) \ -{ \ - va_list args; \ - \ - va_start(args, fmt); \ - vprintk_default(loglevel, fmt, args); \ - va_end(args); \ -} \ -EXPORT_SYMBOL(func) - -define_pr_level(__pr_emerg, LOGLEVEL_EMERG); -define_pr_level(__pr_alert, LOGLEVEL_ALERT); -define_pr_level(__pr_crit, LOGLEVEL_CRIT); -define_pr_level(__pr_err, LOGLEVEL_ERR); -define_pr_level(__pr_warn, LOGLEVEL_WARNING); -define_pr_level(__pr_notice, LOGLEVEL_NOTICE); -define_pr_level(__pr_info, LOGLEVEL_INFO); - -int vprintk_default(int level, const char *fmt, va_list args) +int vprintk_default(const char *fmt, va_list args) { int r; @@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args) return r; } #endif - r = vprintk_emit(0, level, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); return r; } @@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args); + r = vprintk_func(fmt, args); va_end(args); return r; -- cgit v1.2.3 From 0b8f1e2e26bfc6b9abe3f0f3faba2cb0eecb9fb9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Aug 2016 14:37:24 +0200 Subject: perf/core: Fix sideband list-iteration vs. 
event ordering NULL pointer dereference crash Vegard Nossum reported that perf fuzzing generates a NULL pointer dereference crash: > Digging a bit deeper into this, it seems the event itself is getting > created by perf_event_open() and it gets added to the pmu_event_list > through: > > perf_event_open() > - perf_event_alloc() > - account_event() > - account_pmu_sb_event() > - attach_sb_event() > > so at this point the event is being attached but its ->ctx is still > NULL. It seems like ->ctx is set just a bit later in > perf_event_open(), though. > > But before that, __schedule() comes along and creates a stack trace > similar to the one above: > > __schedule() > - __perf_event_task_sched_out() > - perf_iterate_sb() > - perf_iterate_sb_cpu() > - event_filter_match() > - perf_cgroup_match() > - __get_cpu_context() > - (dereference ctx which is NULL) > > So I guess the question is... should the event be attached (= put on > the list) before ->ctx gets set? Or should the cgroup code check for a > NULL ->ctx? The latter seems like the simplest solution. Moving the list-add later creates a bit of a mess.
Reported-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Fixes: f2fb6bef9251 ("perf/core: Optimize side-band event delivery") Link: http://lkml.kernel.org/r/20160804123724.GN6862@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a19550d80ab1..87d02b8cb87e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1716,8 +1716,8 @@ static inline int pmu_filter_match(struct perf_event *event) static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1737,8 +1737,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -2236,10 +2236,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). 
+ */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -5969,6 +5974,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) struct perf_event *event; list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) -- cgit v1.2.3 From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 2 Aug 2016 00:48:12 -0700 Subject: perf/core: Set cgroup in CPU contexts for new cgroup events There's a perf stat bug that is easy to observe on a machine with only one cgroup: $ perf stat -e cycles -I 1000 -C 0 -G / # time counts unit events 1.000161699 cycles / 2.000355591 cycles / 3.000565154 cycles / 4.000951350 cycles / We'd expect some output there. The underlying problem is that there is an optimization in perf_cgroup_sched_{in,out}() that skips the switch of cgroup events if the old and new cgroups in a task switch are the same. This optimization interacts with the current code in two ways that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a cgroup event matches the current task. These are: 1. On creation of the first cgroup event in a CPU: In current code, cpuctx->cgrp is only set in perf_cgroup_sched_in, but due to the aforesaid optimization, perf_cgroup_sched_in will not run until the next cgroup switches in that CPU. This may happen late or never happen, depending on the system's number of cgroups, CPU load, etc. 2. On deletion of the last cgroup event in a cpuctx: In list_del_event, cpuctx->cgrp is set NULL.
Any new cgroup event will not be sched in because cpuctx->cgrp == NULL until a cgroup switch occurs and perf_cgroup_sched_in is executed (updating cpuctx->cgrp). This patch fixes both problems by setting cpuctx->cgrp in list_add_event, mirroring what list_del_event does when removing a cgroup event from CPU context, as introduced in: commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()") With this patch, cpuctx->cgrp is always set/clear when installing/removing the first/last cgroup event in/from the CPU context. With cpuctx->cgrp correctly set, event_filter_match works as intended when events are sched in/out. After the fix, the output is as expected: $ perf stat -e cycles -I 1000 -a -G / # time counts unit events 1.004699159 627342882 cycles / 2.007397156 615272690 cycles / 3.010019057 616726074 cycles / Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vince Weaver Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 54 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 87d02b8cb87e..1903b8f3a705 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. 
+ */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. 
- */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.3 From a23eadfae2fd45536a355b785d5a1533e1955c22 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Tue, 19 Jul 2016 11:44:50 +0200 Subject: sched/deadline: Fix wrap-around in DL heap Current code in cpudeadline.c has a bug in re-heapifying when adding a new element at the end of the heap, because a deadline value of 0 is temporarily set in the new elem, then cpudl_change_key() is called with the actual elem deadline as param. However, the function compares the new deadline to set with the one previously in the elem, which is 0. So, if current absolute deadlines grew so much to have negative values as s64, the comparison in cpudl_change_key() makes the wrong decision. Instead, as from dl_time_before(), the kernel should handle correctly abs deadlines wrap-arounds. This patch fixes the problem with a minimally invasive change that forces cpudl_change_key() to heapify up in this case. 
Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Luca Abeni Cc: Juri Lelli Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468921493-10054-2-git-send-email-tommaso.cucinotta@sssup.it Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5be58820465c..d4184498c9f5 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) if (old_idx == IDX_INVALID) { cp->size++; - cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].dl = dl; cp->elements[cp->size - 1].cpu = cpu; cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); -- cgit v1.2.3 From b8922125e4790fa237a8a4204562ecf457ef54bb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Sat, 9 Jul 2016 15:54:22 +0800 Subject: sched/fair: Fix typo in sync_throttle() We should update cfs_rq->throttled_clock_task, not pcfs_rq->throttled_clock_task. The effect of this bug was probably occasionally erratic group scheduling, particularly in cgroup-intensive workloads. Signed-off-by: Xunlei Pang [ Added changelog.
] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 55e16d30bd99 ("sched/fair: Rework throttle_count sync") Link: http://lkml.kernel.org/r/1468050862-18864-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4088eedea763..039de34f1521 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -- cgit v1.2.3 From 6075620b0590eaf22f10ce88833eb20a57f760d6 Mon Sep 17 00:00:00 2001 From: Giovanni Gherdovich Date: Fri, 5 Aug 2016 10:21:56 +0200 Subject: sched/cputime: Mitigate performance regression in times()/clock_gettime() Commit: 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") fixed a problem whereby clock_nanosleep() followed by clock_gettime() could allow a task to wake early. It addressed the problem by calling the scheduling class's update_curr() when the cputimer starts. Said change induced a considerable performance regression on the syscalls times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID). There are some debuggers and applications that monitor their own performance that accidentally depend on the performance of these specific calls. This patch mitigates the performance loss by prefetching data into the CPU cache, as stalls due to cache misses appear to be where most time is spent in our benchmarks. Here are the performance gains of this patch over v4.7-rc7 on a Sandy Bridge box with 32 logical cores and 2 NUMA nodes.
The test is repeated with a variable number of threads, from 2 to 4*num_cpus; the results are in seconds and correspond to the average of 10 runs; the percentage gain is computed with (before-after)/before so a positive value is an improvement (it's faster). The improvement varies between a few percent for 5-20 threads and more than 10% for 2 or >20 threads. pound_clock_gettime: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.48 3.06 ( 11.83%) 5 3.33 3.25 ( 2.40%) 8 3.37 3.26 ( 3.30%) 12 3.32 3.37 ( -1.60%) 21 4.01 3.90 ( 2.74%) 30 3.63 3.36 ( 7.41%) 48 3.71 3.11 ( 16.27%) 79 3.75 3.16 ( 15.74%) 110 3.81 3.25 ( 14.80%) 128 3.88 3.31 ( 14.76%) pound_times: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.65 3.25 ( 11.03%) 5 3.45 3.17 ( 7.92%) 8 3.52 3.22 ( 8.69%) 12 3.29 3.36 ( -2.04%) 21 4.07 3.92 ( 3.78%) 30 3.87 3.40 ( 12.17%) 48 3.79 3.16 ( 16.61%) 79 3.88 3.28 ( 15.42%) 110 3.90 3.38 ( 13.35%) 128 4.00 3.38 ( 15.45%) pound_times and pound_clock_gettime are two benchmarks included in the MMTests framework. They launch a given number of threads which repeatedly call times() or clock_gettime(). The results above can be reproduced by cloning MMTests from github.com and running the "poundtime" workload: $ git clone https://github.com/gormanm/mmtests.git $ cd mmtests $ cp configs/config-global-dhp__workload_poundtime config $ ./run-mmtests.sh --run-monitor $(uname -r) The above will run "poundtime" measuring the kernel currently running on the machine. Once a new kernel is installed and the machine rebooted, running again $ cd mmtests $ ./run-mmtests.sh --run-monitor $(uname -r) will produce results to compare with. A comparison table will be output with: $ cd mmtests/work/log $ ../../compare-kernels.sh The table will contain a lot of entries; grepping for "Amean" (as in "arithmetic mean") will give the tables presented above. The source code for the two benchmarks is reported at the end of this changelog for clarity.
The cache misses addressed by this patch were found using a combination of `perf top`, `perf record` and `perf annotate`. The incriminated lines were found to be struct sched_entity *curr = cfs_rq->curr; and delta_exec = now - curr->exec_start; in the function update_curr() from kernel/sched/fair.c. This patch prefetches the data from memory just before update_curr is called in the interested execution path. A comparison of the total number of cycles before and after the patch follows; the data is obtained using `perf stat -r 10 -ddd ` running over the same sequence of number of threads used above (a positive gain is an improvement): threads cycles before cycles after gain 2 19,699,563,964 +-1.19% 17,358,917,517 +-1.85% 11.88% 5 47,401,089,566 +-2.96% 45,103,730,829 +-0.97% 4.85% 8 80,923,501,004 +-3.01% 71,419,385,977 +-0.77% 11.74% 12 112,326,485,473 +-0.47% 110,371,524,403 +-0.47% 1.74% 21 193,455,574,299 +-0.72% 180,120,667,904 +-0.36% 6.89% 30 315,073,519,013 +-1.64% 271,222,225,950 +-1.29% 13.92% 48 321,969,515,332 +-1.48% 273,353,977,321 +-1.16% 15.10% 79 337,866,003,422 +-0.97% 289,462,481,538 +-1.05% 14.33% 110 338,712,691,920 +-0.78% 290,574,233,170 +-0.77% 14.21% 128 348,384,794,006 +-0.50% 292,691,648,206 +-0.66% 15.99% A comparison of cache miss vs total cache loads ratios, before and after the patch (again from the `perf stat -r 10 -ddd ` tables): threads L1 misses/total*100 L1 misses/total*100 gain before after 2 7.43 +-4.90% 7.36 +-4.70% 0.94% 5 13.09 +-4.74% 13.52 +-3.73% -3.28% 8 13.79 +-5.61% 12.90 +-3.27% 6.45% 12 11.57 +-2.44% 8.71 +-1.40% 24.72% 21 12.39 +-3.92% 9.97 +-1.84% 19.53% 30 13.91 +-2.53% 11.73 +-2.28% 15.67% 48 13.71 +-1.59% 12.32 +-1.97% 10.14% 79 14.44 +-0.66% 13.40 +-1.06% 7.20% 110 15.86 +-0.50% 14.46 +-0.59% 8.83% 128 16.51 +-0.32% 15.06 +-0.78% 8.78% As a final note, the following shows the evolution of performance figures in the "poundtime" benchmark and pinpoints commit 6e998916dfe3 ("sched/cputime: Fix 
clock_nanosleep()/clock_gettime() inconsistency") as a major source of degradation, mostly unaddressed to this day (figures expressed in seconds). pound_clock_gettime: threads [parent of 6e998916dfe3] [6e998916dfe3 itself] [4.7-rc7] 2 2.23 3.68 ( -64.56%) 3.48 (-55.48%) 5 2.83 3.78 ( -33.42%) 3.33 (-17.43%) 8 2.84 4.31 ( -52.12%) 3.37 (-18.76%) 12 3.09 3.61 ( -16.74%) 3.32 ( -7.17%) 21 3.14 4.63 ( -47.36%) 4.01 (-27.71%) 30 3.28 5.75 ( -75.37%) 3.63 (-10.80%) 48 3.02 6.05 (-100.56%) 3.71 (-22.99%) 79 2.88 6.30 (-118.90%) 3.75 (-30.26%) 110 2.95 6.46 (-119.00%) 3.81 (-29.24%) 128 3.05 6.42 (-110.08%) 3.88 (-27.04%) pound_times: threads [parent of 6e998916dfe3] [6e998916dfe3 itself] [4.7-rc7] 2 2.27 3.73 ( -64.71%) 3.65 (-61.14%) 5 2.78 3.77 ( -35.56%) 3.45 (-23.98%) 8 2.79 4.41 ( -57.71%) 3.52 (-26.05%) 12 3.02 3.56 ( -17.94%) 3.29 ( -9.08%) 21 3.10 4.61 ( -48.74%) 4.07 (-31.34%) 30 3.33 5.75 ( -72.53%) 3.87 (-16.01%) 48 2.96 6.06 (-105.04%) 3.79 (-28.10%) 79 2.88 6.24 (-116.83%) 3.88 (-34.81%) 110 2.98 6.37 (-114.08%) 3.90 (-31.12%) 128 3.10 6.35 (-104.61%) 4.00 (-28.87%) The source code of the two benchmarks follows. 
To compile the two: NR_THREADS=42 for FILE in pound_times pound_clock_gettime; do gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE done ==== BEGIN pound_times.c ==== struct tms start; void *pound (void *threadid) { struct tms end; int oldutime = 0; int utime; int i; for (i = 0; i < 5000000 / NUM_THREADS; i++) { times(&end); utime = ((int)end.tms_utime - (int)start.tms_utime); if (oldutime > utime) { printf("utime decreased, was %d, now %d!\n", oldutime, utime); } oldutime = utime; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long i; times(&start); for (i = 0; i < NUM_THREADS; i++) { pthread_create (&th[i], NULL, pound, (void *)i); } pthread_exit(NULL); return 0; } ==== END pound_times.c ==== ==== BEGIN pound_clock_gettime.c ==== void *pound (void *threadid) { struct timespec ts; int rc, i; unsigned long prev = 0, this = 0; for (i = 0; i < 5000000 / NUM_THREADS; i++) { rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (rc < 0) perror("clock_gettime"); this = (ts.tv_sec * 1000000000) + ts.tv_nsec; if (0 && this < prev) printf("%lu ns timewarp at iteration %d\n", prev - this, i); prev = this; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long rc, i; pid_t pgid; for (i = 0; i < NUM_THREADS; i++) { rc = pthread_create(&th[i], NULL, pound, (void *)i); if (rc < 0) perror("pthread_create"); } pthread_exit(NULL); return 0; } ==== END pound_clock_gettime.c ==== Suggested-by: Mike Galbraith Signed-off-by: Giovanni Gherdovich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c883fe8e440..2a906f20fba7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ 
-74,6 +74,7 @@ #include #include #include +#include #include #include @@ -2971,6 +2972,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); +/* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + /* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's @@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } -- cgit v1.2.3 From c0c8c9fa210c9a042060435f17e40ba4a76d6d6f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 4 Aug 2016 09:42:20 +0800 Subject: sched/deadline: Fix lock pinning warning during CPU hotplug The following warning can be triggered by hot-unplugging the CPU on which an active SCHED_DEADLINE task is running on: WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3531 lock_release+0x690/0x6a0 releasing a pinned lock Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 ? dl_task_timer+0x1a1/0x2b0 warn_slowpath_fmt+0x4f/0x60 ? sched_clock+0x13/0x20 lock_release+0x690/0x6a0 ? enqueue_pushable_dl_task+0x9b/0xa0 ? enqueue_task_dl+0x1ca/0x480 _raw_spin_unlock+0x1f/0x40 dl_task_timer+0x1a1/0x2b0 ? 
push_dl_task.part.31+0x190/0x190 WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3649 lock_unpin_lock+0x181/0x1a0 unpinning an unpinned lock Call Trace: dump_stack+0x99/0xd0 __warn+0xd1/0xf0 warn_slowpath_fmt+0x4f/0x60 lock_unpin_lock+0x181/0x1a0 dl_task_timer+0x127/0x2b0 ? push_dl_task.part.31+0x190/0x190 As per the comment before this code, it's safe to drop the RQ lock here, and since we (potentially) change rq, unpin and repin to avoid the splat. Signed-off-by: Wanpeng Li [ Rewrote changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470274940-17976-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fcb7f0217ff4..1ce8867283dc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * * XXX figure out if select_task_rq_dl() deals with offline cpus. 
*/ - if (unlikely(!rq->online)) + if (unlikely(!rq->online)) { + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); + rf.cookie = lockdep_pin_lock(&rq->lock); + } /* * Queueing this task back might have overloaded rq, check if we need -- cgit v1.2.3 From 229ce631574761870a2ac938845fadbd07f35caa Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Jul 2016 16:15:56 +0800 Subject: locking/pvqspinlock: Fix double hash race When the lock holder vCPU is racing with the queue head: CPU 0 (lock holder) CPU 1 (queue head) =================== ================= spin_lock(); spin_lock(); pv_kick_node(): pv_wait_head_or_lock(): if (!lp) { lp = pv_hash(lock, pn); xchg(&l->locked, _Q_SLOW_VAL); } WRITE_ONCE(pn->state, vcpu_halted); cmpxchg(&pn->state, vcpu_halted, vcpu_hashed); WRITE_ONCE(l->locked, _Q_SLOW_VAL); (void)pv_hash(lock, pn); In this case, the lock holder inserts the pv_node of the queue head into the hash table and sets _Q_SLOW_VAL unnecessarily. This patch avoids it by restoring/setting the vcpu_hashed state after adaptive lock spinning fails. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Pan Xinhui Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Link: http://lkml.kernel.org/r/1468484156-4521-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 37649e69056c..8a99abf58080 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) goto gotlock; } } - WRITE_ONCE(pn->state, vcpu_halted); + WRITE_ONCE(pn->state, vcpu_hashed); qstat_inc(qstat_pv_wait_head, true); qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); -- cgit v1.2.3 From c2ace36b884de9330c4149064ae8d212d2e0d9ee Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 13 Jul 2016 18:23:34 +0800 Subject: locking/pvqspinlock: Fix a bug in qstat_read() It's obviously wrong to reset stat to 0 here, so let's remove the assignment. Otherwise the result is always zero when we check the latency of kick/wake. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1468405414-3700-1-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_stat.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 22e025309845..b9d031516254 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, */ if ((counter == qstat_pv_latency_kick) || (counter == qstat_pv_latency_wake)) { - stat = 0; if (kicks) stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); } -- cgit v1.2.3 From f9bcf1e0e0145323ba2cf72ecad5264ff3883eb1 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 11 Aug 2016 13:36:35 +0800 Subject: sched/cputime: Fix steal time accounting Commit: 57430218317 ("sched/cputime: Count actually elapsed irq & softirq time") ... didn't take steal time into consideration when passing the noirqtime kernel parameter. As Paolo pointed out before: | Why not? If idle=poll, for example, any time the guest is suspended (and | thus cannot poll) does count as stolen time. This patch fixes it by subtracting steal time from the idle time accounting when the noirqtime parameter is set. The average idle time drops from 56.8% to 54.75% for a nohz idle KVM guest (noirqtime, idle=poll, four vCPUs running on one pCPU). 
Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470893795-3527-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 1934f658c036..8b9bcc5a58fa 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -508,13 +508,20 @@ void account_process_tick(struct task_struct *p, int user_tick) */ void account_idle_ticks(unsigned long ticks) { - + cputime_t cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - account_idle_time(jiffies_to_cputime(ticks)); + cputime = cputime_one_jiffy; + steal = steal_account_process_time(cputime); + + if (steal >= cputime) + return; + + cputime -= steal; + account_idle_time(cputime); } /* -- cgit v1.2.3 From 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Aug 2016 14:58:24 +0200 Subject: sched/cputime: Fix omitted ticks passed in parameter Commit: f9bcf1e0e014 ("sched/cputime: Fix steal time accounting") ... fixes a leak on steal time accounting but forgets to account the ticks passed in parameters, assuming there is only one to take into account. Let's consider that parameter back. 
Signed-off-by: Frederic Weisbecker Acked-by: Wanpeng Li Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160811125822.GB4214@lerouge Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8b9bcc5a58fa..9858266fb0b3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -509,12 +509,13 @@ void account_process_tick(struct task_struct *p, int user_tick) void account_idle_ticks(unsigned long ticks) { cputime_t cputime, steal; + if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - cputime = cputime_one_jiffy; + cputime = jiffies_to_cputime(ticks); steal = steal_account_process_time(cputime); if (steal >= cputime) -- cgit v1.2.3 From 62822e2ec4ad091ba31f823f577ef80db52e3c2c Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 11 Aug 2016 14:49:29 -0700 Subject: PM / hibernate: Restore processor state before using per-CPU variables Restore the processor state before calling any other functions to ensure per-CPU variables can be used with KASLR memory randomization. Tracing functions use per-CPU variables (GS based on x86) and one was called just before restoring the processor state fully. It resulted in a double fault when both the tracing & the exception handler functions tried to use a per-CPU variable. Fixes: bb3632c6101b (PM / sleep: trace events for suspend/resume) Reported-and-tested-by: Borislav Petkov Reported-by: Jiri Kosina Tested-by: Rafael J. Wysocki Tested-by: Jiri Kosina Signed-off-by: Thomas Garnier Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0ee1df0a0bd6..61761aa7cc19 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -300,12 +300,12 @@ static int create_image(int platform_mode) save_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); error = swsusp_arch_suspend(); + /* Restore control flow magically appears here */ + restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) printk(KERN_ERR "PM: Error %d creating hibernation image\n", error); - /* Restore control flow magically appears here */ - restore_processor_state(); if (!in_suspend) events_check_enabled = false; -- cgit v1.2.3 From 747ea55e4f78fd980350c39570a986b8c1c3e4aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 Aug 2016 22:17:17 +0200 Subject: bpf: fix bpf_skb_in_cgroup helper naming While hashing out BPF's current_task_under_cgroup helper bits, it came to discussion that the skb_in_cgroup helper name was suboptimally chosen. Tejun says: So, I think in_cgroup should mean that the object is in that particular cgroup while under_cgroup in the subhierarchy of that cgroup. Let's rename the other subhierarchy test to under too. I think that'd be a lot less confusing going forward. [...] It's more intuitive and gives us the room to implement the real "in" test if ever necessary in the future. Since this touches uapi bits, we need to change this as long as v4.8 is not yet officially released. Thus, change the helper enum and rename related bits. 
Fixes: 4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto") Reference: http://patchwork.ozlabs.org/patch/658500/ Suggested-by: Sargun Dhillon Suggested-by: Tejun Heo Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7094c69ac199..daea765d72e6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1053,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - if (func_id != BPF_FUNC_skb_in_cgroup) + if (func_id != BPF_FUNC_skb_under_cgroup) goto error; break; default: @@ -1075,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; - case BPF_FUNC_skb_in_cgroup: + case BPF_FUNC_skb_under_cgroup: if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; -- cgit v1.2.3