From 4c737b41de7f4eef2a593803bad1b918dd718b10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: make cgroup_path() and friends behave in the style of strlcpy() cgroup_path() and friends used to format the path from the end and thus the resulting path usually didn't start at the start of the passed in buffer. Also, when the buffer was too small, the partial result was truncated from the head rather than tail and there was no way to tell how long the full path would be. These make the functions less robust and more awkward to use. With recent updates to kernfs_path(), cgroup_path() and friends can be made to behave in strlcpy() style. * cgroup_path(), cgroup_path_ns[_locked]() and task_cgroup_path() now always return the length of the full path. If buffer is too small, it contains nul terminated truncated output. * All users updated accordingly. v2: cgroup_path() usage in kernel/sched/debug.c converted. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Serge Hallyn Cc: Peter Zijlstra --- kernel/cgroup.c | 48 ++++++++++++++++++++++-------------------------- kernel/cpuset.c | 12 ++++++------ kernel/sched/debug.c | 3 ++- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..3861161e460f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2315,22 +2315,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2353,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2367,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -5716,7 +5711,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5759,18 +5754,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) { retval = -ENAMETOOLONG; goto out_unlock; } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6035,8 +6030,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6046,13 +6042,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..793ae6fd96dc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2689,7 +2689,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2700,18 +2700,18 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: kfree(buf); out: - return retval; + return 0; } #endif /* CONFIG_PROC_PID_CPUSET */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..23cb609ba4eb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -410,7 +410,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif -- cgit v1.2.3 From ed1777de25e45bfb58fad63341904f8a77911785 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: add tracepoints for basic operations Debugging what goes wrong with cgroup setup can get hairy. Add tracepoints for cgroup hierarchy mount, cgroup creation/destruction and task migration operations for better visibility. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3861161e460f..5e2e81ad9175 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2825,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3587,6 +3601,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4355,6 +4371,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5020,6 +5038,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5306,6 +5326,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5481,6 +5503,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } -- cgit v1.2.3 From 679a5e3f12830392d14f94b04bbe0f3cabdbd773 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 11:58:36 +0200 Subject: cpuset: fix error handling regression in proc_cpuset_show() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") botched the conversion of proc_cpuset_show() and broke its error handling. It made the function return 0 on failures and fail to handle error returns from cgroup_path_ns(). Fix it. Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cpuset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 793ae6fd96dc..97dd8e178786 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2698,12 +2698,13 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) goto out_free; seq_puts(m, buf); seq_putc(m, '\n'); @@ -2711,7 +2712,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, out_free: kfree(buf); out: - return 0; + return retval; } #endif /* CONFIG_PROC_PID_CPUSET */ -- cgit v1.2.3 From e0223003e6e141533446d01a92784592a97a8552 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 15:49:40 +0200 Subject: cgroup: fix error handling regressions in proc_cgroup_show() and cgroup_release_agent() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") broke error handling in proc_cgroup_show() and cgroup_release_agent() by not handling negative return values from cgroup_path_ns_locked(). Fix it. Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e2e81ad9175..a7f9fb4e1fc7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5781,10 +5781,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) { + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } seq_puts(m, buf); } else { @@ -6069,7 +6069,7 @@ static void cgroup_release_agent(struct work_struct *work) spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (ret >= PATH_MAX) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; -- cgit v1.2.3 From 38addce8b600ca335dc86fa3d48c890f1c6fa1f4 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:41:19 +0200 Subject: gcc-plugins: Add latent_entropy plugin This adds a new gcc plugin named "latent_entropy". It is designed to extract as much possible uncertainty from a running system at boot time as possible, hoping to capitalize on any possible variation in CPU operation (due to runtime data differences, hardware differences, SMP ordering, thermal timing variation, cache behavior, etc). At the very least, this plugin is a much more comprehensive example for how to manipulate kernel code using the gcc plugin internals. The need for very-early boot entropy tends to be very architecture or system design specific, so this plugin is more suited for those sorts of special cases. The existing kernel RNG already attempts to extract entropy from reliable runtime variation, but this plugin takes the idea to a logical extreme by permuting a global variable based on any variation in code execution (e.g. a different value (and permutation function) is used to permute the global based on loop count, case statement, if/then/else branching, etc). To do this, the plugin starts by inserting a local variable in every marked function. The plugin then adds logic so that the value of this variable is modified by randomly chosen operations (add, xor and rol) and random values (gcc generates separate static values for each location at compile time and also injects the stack pointer at runtime). The resulting value depends on the control flow path (e.g., loops and branches taken). Before the function returns, the plugin mixes this local variable into the latent_entropy global variable. The value of this global variable is added to the kernel entropy pool in do_one_initcall() and _do_fork(), though it does not credit any bytes of entropy to the pool; the contents of the global are just used to mix the pool. Additionally, the plugin can pre-initialize arrays with build-time random contents, so that two different kernel builds running on identical hardware will not have the same starting values. Signed-off-by: Emese Revfy [kees: expanded commit message and code comments] Signed-off-by: Kees Cook --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..001b18473a07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1780,6 +1780,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. -- cgit v1.2.3 From 0766f788eb727e2e330d55d30545db65bcf2623f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:42:34 +0200 Subject: latent_entropy: Mark functions with __latent_entropy The __latent_entropy gcc attribute can be used only on functions and variables. If it is on a function then the plugin will instrument it for gathering control-flow entropy. If the attribute is on a variable then the plugin will initialize it with random contents. The variable must be an integer, an integer array type or a structure with integer fields. These specific functions have been selected because they are init functions (to help gather boot-time entropy), are called at unpredictable times, or they have variable loops, each of which provide some level of latent entropy. Signed-off-by: Emese Revfy [kees: expanded commit message] Signed-off-by: Kees Cook --- kernel/fork.c | 6 ++++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/timer.c | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 001b18473a07..05393881ef39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,7 +404,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1296,7 +1297,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..e5164deb51e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..004996df2f10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8283,7 +8283,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..34033fd09c8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -482,7 +482,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +518,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); -- cgit v1.2.3 From 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Oct 2016 08:04:03 +0800 Subject: sched/fair: Fix sched domains NULL dereference in select_idle_sibling() Commit: 10e2f1acd01 ("sched/core: Rewrite and improve select_idle_siblings()") ... improved select_idle_sibling(), but also triggered a regression (crash) during CPU-hotplug: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 IP: [] select_idle_sibling+0x1c2/0x4f0 Call Trace: select_task_rq_fair+0x749/0x930 ? select_task_rq_fair+0xb4/0x930 ? __lock_is_held+0x54/0x70 try_to_wake_up+0x19a/0x5b0 default_wake_function+0x12/0x20 autoremove_wake_function+0x12/0x40 __wake_up_common+0x55/0x90 __wake_up+0x39/0x50 wake_up_klogd_work_func+0x40/0x60 irq_work_run_list+0x57/0x80 irq_work_run+0x2c/0x30 smp_irq_work_interrupt+0x2e/0x40 irq_work_interrupt+0x96/0xa0 ? _raw_spin_unlock_irqrestore+0x45/0x80 try_to_wake_up+0x4a/0x5b0 wake_up_state+0x10/0x20 __kthread_unpark+0x67/0x70 kthread_unpark+0x22/0x30 cpuhp_online_idle+0x3e/0x70 cpu_startup_entry+0x6a/0x450 start_secondary+0x154/0x180 This can be reproduced by running the ftrace test case of kselftest, the test case will hot-unplug the CPU and the CPU will attach to the NULL sched-domain during scheduler teardown. The step 2 for the rewrite select_idle_siblings(): | Step 2) tracks the average cost of the scan and compares this to the | average idle time guestimate for the CPU doing the wakeup. If the CPU which doing the wakeup is the going hot-unplug CPU, then NULL sched domain will be dereferenced to acquire the average cost of the scan. This patch fix it by failing the search of an idle CPU in the LLC process if this sched domain is NULL. Tested-by: Catalin Marinas Signed-off-by: Wanpeng Li Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1475971443-3187-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 502e95a6e927..8b03fb5d1b9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. -- cgit v1.2.3 From a705e07b9c80df27b6bb12f7a4cd4cf4ed2f728b Mon Sep 17 00:00:00 2001 From: Joonas Lahtinen Date: Wed, 12 Oct 2016 13:18:56 +0300 Subject: cpu/hotplug: Use distinct name for cpu_hotplug.dep_map Use distinctive name for cpu_hotplug.dep_map to avoid the actual cpu_hotplug.lock appearing as cpu_hotplug.lock#2 in lockdep splats. Signed-off-by: Joonas Lahtinen Reviewed-by: Chris Wilson Acked-by: Gautham R. Shenoy Cc: Andrew Morton Cc: Daniel Vetter Cc: Gautham R . Shenoy Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; -- cgit v1.2.3 From 54e23845e965898f65f76aba79fa9db76d830fa9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:47:02 +0200 Subject: alarmtimer: Remove unused but set variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the set but unused variable base in alarm_clock_get to fix the following warning when building with 'W=1': kernel/time/alarmtimer.c: In function ‘alarm_timer_create’: kernel/time/alarmtimer.c:545:21: warning: variable ‘base’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: John Stultz Link: http://lkml.kernel.org/r/20161017094702.10873-1-tklauser@distanz.ch Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } -- cgit v1.2.3 From b5a9b340789b2b24c6896bcf7a065c31a4db671c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: sched/fair: Fix incorrect task group ->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes) The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets a less running time than what it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. ) The load of group entities don't have to be intialized to something else than 0 because their load will increase when an entity is attached. Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes) Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76ee7de1859d..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway -- cgit v1.2.3 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; -- cgit v1.2.3 From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; -- cgit v1.2.3 From 8835ca59dac2bc1e0136791abf3ccd51588803ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 19 Oct 2016 09:11:24 -0700 Subject: printk: suppress empty continuation lines We have a fairly common pattern where you print several things as continuations on one single line in a loop, and then at the end you do printk(KERN_CONT "\n"); to flush the buffered output. But if the output was flushed by something else (concurrent printk activity, or just system logging), we don't want that final flushing to just print an empty line. So just suppress empty continuation lines when they couldn't be merged into the line they are a continuation of. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c cont_flush(); } + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { if (cont_add(facility, level, lflags, text, text_len)) -- cgit v1.2.3 From 3118dac501bc0317de099db81618d589503351e1 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 6 Oct 2016 23:06:43 +0530 Subject: kernel/irq: Export irq_set_parent() The TPS65217 driver grew interrupt support which uses irq_set_parent(). While it's not yet clear why this is used in the first place, building the driver as a module fails with: ERROR: ".irq_set_parent" [drivers/mfd/tps65217.ko] undefined! The correctness of the driver change is still investigated, but for now it's less trouble to export irq_set_parent() than dealing with the build wreckage. [ tglx: Rewrote changelog and made the export GPL ] Fixes: 6556bdacf646 ("mfd: tps65217: Add support for IRQs") Signed-off-by: Sudip Mukherjee Cc: Sudip Mukherjee Cc: Marcin Niestroj Cc: Grygorii Strashko Cc: Tony Lindgren Cc: Lee Jones Link: http://lkml.kernel.org/r/1475775403-27207-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..9c4d30483264 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* -- cgit v1.2.3 From f660f6066716b700148f60dba3461e65efff3123 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Oct 2016 15:10:51 +0300 Subject: softirq: Display IRQ_POLL for irq-poll statistics This library was moved to the generic area and was renamed to irq-poll. Hence, update proc/softirqs output accordingly. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; -- cgit v1.2.3 From 1adb469b9b76276d7e5ea36a20a24c47d6618a0b Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 21 Oct 2016 16:24:09 +0100 Subject: PM / suspend: Fix missing KERN_CONT for suspend message Commit 4bcc595ccd80 (printk: reinstate KERN_CONT for printing continuation lines) exposed a missing KERN_CONT from one of the messages shown on entering suspend. With v4.9-rc1, the 'done.' shown after syncing the filesystems no longer appears as a continuation but a new message with its own timestamp. [ 9.259566] PM: Syncing filesystems ... [ 9.264119] done. Fix this by adding the KERN_CONT log level for the 'done.' part of the message seen after syncing filesystems. While we are at it, convert these suspend printks to pr_info and pr_cont, respectively. Signed-off-by: Jon Hunter Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif -- cgit v1.2.3 From b831275a3553c32091222ac619cfddd73a5553fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:41:56 +0200 Subject: timers: Plug locking race vs. timer migration Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the timer flags. As a consequence the compiler is allowed to reload the flags between the initial check for TIMER_MIGRATION and the following timer base computation and the spin lock of the base. While this has not been observed (yet), we need to make sure that it never happens. Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..0d4b91c5a374 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -943,7 +943,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); -- cgit v1.2.3 From 4da9152a4308dcbf611cde399c695c359fc9145f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:55:10 +0200 Subject: timers: Lock base for same bucket optimization Linus stumbled over the unlocked modification of the timer expiry value in mod_timer() which is an optimization for timers which stay in the same bucket - due to the bucket granularity - despite their expiry time getting updated. The optimization itself still makes sense even if we take the lock, because in case that the bucket stays the same, we avoid the pointless queue/enqueue dance. Make the check and the modification of timer->expires protected by the base lock and shuffle the remaining code around so we can keep the lock held when we actually have to requeue the timer to a different bucket. Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0d4b91c5a374..ccf913038f9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -971,6 +971,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -979,13 +981,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -995,14 +1000,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1035,9 +1040,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); -- cgit v1.2.3 From 041ad7bc758db259bb960ef795197dd14aab19a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:35 +0000 Subject: timers: Prevent base clock rewind when forwarding clock Ashton and Michael reported, that kernel versions 4.8 and later suffer from USB timeouts which are caused by the timer wheel rework. This is caused by a bug in the base clock forwarding mechanism, which leads to timers expiring early. The scenario which leads to this is: run_timers() while (jiffies >= base->clk) { collect_expired_timers(); base->clk++; expire_timers(); } So base->clk = jiffies + 1. Now the cpu goes idle: idle() get_next_timer_interrupt() nextevt = __next_time_interrupt(); if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies has not advanced since run_timers(), so this assignment effectively decrements base->clk by one. base->clk is the index into the timer wheel arrays. So let's assume the following state after the base->clk increment in run_timers(): jiffies = 0 base->clk = 1 A timer gets enqueued with an expiry delta of 63 ticks (which is the case with the USB timeout and HZ=250) so the resulting bucket index is: base->clk + delta = 1 + 63 = 64 The timer goes into the first wheel level. The array size is 64 so it ends up in bucket 0, which is correct as it takes 63 ticks to advance base->clk to index into bucket 0 again. If the cpu goes idle before jiffies advance, then the bug in the forwarding mechanism sets base->clk back to 0, so the next invocation of run_timers() at the next tick will index into bucket 0 and therefore expire the timer 62 ticks too early. Instead of blindly setting base->clk to jiffies we must make the forwarding conditional on jiffies > base->clk, but we cannot use jiffies for this as we might run into the following issue: if (time_after(jiffies, base->clk) { if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies can increment between the check and the assigment far enough to advance beyond nextevt. So we need to use a stable value for checking. get_next_timer_interrupt() has the basej argument which is the jiffies value snapshot taken in the calling code. So we can just that. Thanks to Ashton for bisecting and providing trace data! Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.175308322@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ccf913038f9f..7c446fb5163a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1523,12 +1523,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; -- cgit v1.2.3 From 6bad6bccf2d717f652d37e63cf261eaa23466009 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:37 +0000 Subject: timers: Prevent base clock corruption when forwarding When a timer is enqueued we try to forward the timer base clock. This mechanism has two issues: 1) Forwarding a remote base unlocked The forwarding function is called from get_target_base() with the current timer base lock held. But if the new target base is a different base than the current base (can happen with NOHZ, sigh!) then the forwarding is done on an unlocked base. This can lead to corruption of base->clk. Solution is simple: Invoke the forwarding after the target base is locked. 2) Possible corruption due to jiffies advancing This is similar to the issue in get_net_timer_interrupt() which was fixed in the previous patch. jiffies can advance between check and assignement and therefore advancing base->clk beyond the next expiry value. So we need to read jiffies into a local variable once and do the checks and assignment with the local copy. Fixes: a683f390b93f("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.253640125@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 7c446fb5163a..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -1037,6 +1031,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance -- cgit v1.2.3 From f5d6d2da0d9098a4aa0ebcc187aa0fc167045d6b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Oct 2016 13:37:04 +0200 Subject: sched/fair: Remove unused but set variable 'rq' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit: 8663e24d56dc ("sched/fair: Reorder cgroup creation code") ... the variable 'rq' in alloc_fair_sched_group() is set but no longer used. Remove it to fix the following GCC warning when building with 'W=1': kernel/sched/fair.c:8842:13: warning: variable ‘rq’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026113704.8981-1-tklauser@distanz.ch Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8839,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8854,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 16 ++++++++++++++++ kernel/sched/wait.c | 10 ---------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash -- cgit v1.2.3 From b274c0bb394c6a69ac12feac7c2db81f5aff5a55 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Oct 2016 17:46:21 -0700 Subject: kcov: properly check if we are in an interrupt in_interrupt() returns a nonzero value when we are either in an interrupt or have bh disabled via local_bh_disable(). Since we are interested in only ignoring coverage from actual interrupts, do a proper check instead of just calling in_interrupt(). As a result of this change, kcov will start to collect coverage from within local_bh_disable()/local_bh_enable() sections. Link: http://lkml.kernel.org/r/1476115803-20712-1-git-send-email-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Nicolai Stange Cc: Andrey Ryabinin Cc: Kees Cook Cc: James Morse Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From 0933840acf7b65d6d30a5b6089d882afea57aca3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Oct 2016 13:10:11 +0200 Subject: perf/core: Protect PMU device removal with a 'pmu_bus_running' check, to fix CONFIG_DEBUG_TEST_DRIVER_REMOVE=y kernel panic CAI Qian reported a crash in the PMU uncore device removal code, enabled by the CONFIG_DEBUG_TEST_DRIVER_REMOVE=y option: https://marc.info/?l=linux-kernel&m=147688837328451 The reason for the crash is that perf_pmu_unregister() tries to remove a PMU device which is not added at this point. We add PMU devices only after pmu_bus is registered, which happens in the perf_event_sysfs_init() call and sets the 'pmu_bus_running' flag. The fix is to get the 'pmu_bus_running' flag state at the point the PMU is taken out of the PMU list and remove the device later only if it's set. Reported-by: CAI Qian Tested-by: CAI Qian Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rob Herring Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161020111011.GA13361@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..a5d2e62faf7e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8855,7 +8855,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8872,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit v1.2.3