From d86ab9cff8b936aadde444d0e263a8db5ff0349b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs: CPU0 | CPU1 | | RT task scheduled | SCHED_CPUFREQ_RT is set | CPU1->last_update = now | freq transition to max | last_freq_update_time = now | more than TICK_NSEC nsecs | a small CFS wakes up | CPU0->last_update = now1 | delta_ns(CPU0) < TICK_NSEC* | CPU0's util is considered | delta_ns(CPU1) = | last_freq_update_time - | CPU1->last_update = 0 | < TICK_NSEC | CPU1 is still considered | CPU1->SCHED_CPUFREQ_RT is set | we stay at max (until CPU1 | exits from idle) | * delta_ns is actually negative as now1 > last_freq_update_time While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..622eed1b7658 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -245,11 +245,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -265,7 +264,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; continue; @@ -309,7 +308,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } -- cgit v1.2.3 From 0bae5fd3330be0517fba697e6b228601d421fade Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Thu, 11 May 2017 10:31:24 +0530 Subject: PM / hibernate: Declare variables as static Fixing sparse warnings: 'symbol not declared. Should it be static?' Signed-off-by: Pushkar Jambhlekar Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a..a628cccafa4a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1422,7 +1422,7 @@ static unsigned int nr_meta_pages; * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. */ -unsigned int alloc_normal, alloc_highmem; +static unsigned int alloc_normal, alloc_highmem; /* * Memory bitmap used for marking saveable pages (during hibernation) or * hibernation image pages (during restore) -- cgit v1.2.3 From 625ed2bf049d5a352c1bcca962d6e133454eaaff Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 26 Apr 2017 08:27:56 +0200 Subject: sched/cfs: Make util/load_avg more stable In the current implementation of load/util_avg, we assume that the ongoing time segment has fully elapsed, and util/load_sum is divided by LOAD_AVG_MAX, even if part of the time segment still remains to run. As a consequence, this remaining part is considered as idle time and generates unexpected variations of util_avg of a busy CPU in the range [1002..1024[ whereas util_avg should stay at 1023. In order to keep the metric stable, we should not consider the ongoing time segment when computing load/util_avg but only the segments that have already fully elapsed. But to not consider the current time segment adds unwanted latency in the load/util_avg responsivness especially when the time is scaled instead of the contribution. Instead of waiting for the current time segment to have fully elapsed before accounting it in load/util_avg, we can already account the elapsed part but change the range used to compute load/util_avg accordingly. At the very beginning of a new time segment, the past segments have been decayed and the max value is LOAD_AVG_MAX*y. At the very end of the current time segment, the max value becomes: LOAD_AVG_MAX*y + 1024(us) (== LOAD_AVG_MAX) In fact, the max value is: LOAD_AVG_MAX*y + sa->period_contrib at any time in the time segment. Taking advantage of the fact that: LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 the range becomes [0..LOAD_AVG_MAX-1024+sa->period_contrib]. As the elapsed part is already accounted in load/util_sum, we update the max value according to the current position in the time segment instead of removing its contribution. Suggested-by: Peter Zijlstra Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1493188076-2767-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..4f1825d60937 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2916,12 +2916,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } -- cgit v1.2.3 From cf15ca8deda86b27b66e27848b4b0fe58098fc0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:11:53 +0200 Subject: sched/clock: Initialize all per-CPU state before switching (back) to unstable In preparation for not keeping the sched_clock_tick() active for stable TSC, we need to explicitly initialize all per-CPU state before switching back to unstable. Note: this patch looses the __gtod_offset calculation; it will be restored in the next one. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 60 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..dc650851935f 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -124,6 +124,12 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { struct sched_clock_data *scd = this_scd(); @@ -141,8 +147,37 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +185,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -357,8 +376,7 @@ void sched_clock_tick(void) * XXX arguably we can skip this if we expose tsc_clocksource_reliable */ scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); + __scd_stamp(scd); if (!sched_clock_stable() && likely(sched_clock_running)) sched_clock_local(scd); -- cgit v1.2.3 From b421b22b00b0011f6a2ce3561176c4e79e640c49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:14:13 +0200 Subject: x86/tsc, sched/clock, clocksource: Use clocksource watchdog to provide stable sync points Currently we keep sched_clock_tick() active for stable TSC in order to keep the per-CPU state semi up-to-date. The (obvious) problem is that by the time we detect TSC is borked, our per-CPU state is also borked. So hook into the clocksource watchdog and call a method after we've found it to still be stable. There's the obvious race where the TSC goes wonky between finding it stable and us running the callback, but closing that is too much work and not really worth it, since we're already detecting TSC wobbles after the fact, so we cannot, per definition, fully avoid funny clock values. And since the watchdog runs less often than the tick, this is also an optimization. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 36 +++++++++++++++++++++++++++--------- kernel/time/clocksource.c | 3 +++ 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index dc650851935f..f861637f7fdc 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -366,20 +366,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); - /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. - * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable - */ scd = this_scd(); __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + if (!sched_clock_stable()) + return; + + /* + * Called under watchdog_lock. + * + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. + */ + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { -- cgit v1.2.3 From ac1e843f0900bea92fcb47f6205e1f9ffb0d469c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:26:23 +0200 Subject: sched/clock: Remove unused argument to sched_clock_idle_wakeup_event() The argument to sched_clock_idle_wakeup_event() has not been used in a long time. Remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 4 ++-- kernel/time/tick-sched.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f861637f7fdc..750a92c9db7e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -410,9 +410,9 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. (called with irqs disabled): */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { if (timekeeping_suspended) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..9c2dc64e31d8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) -- cgit v1.2.3 From 3067a33d5fec856bb297d58e7f03411d060ccdee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:42:03 +0200 Subject: sched/clock: Remove watchdog touching Commit: 2bacec8c318c ("sched: touch softlockup watchdog after idling") introduced the touch_softlockup_watchdog_sched() call without justification and I feel sched_clock management is not the right place, it should only be concerned with producing semi coherent time. If this causes watchdog thingies, we can find a better place. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 750a92c9db7e..c30c05f05d6f 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -418,7 +418,6 @@ void sched_clock_idle_wakeup_event(void) return; sched_clock_tick(); - touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.2.3 From f9fccdb9efef60dbcf84d493514b475c41aa866f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:43:59 +0200 Subject: cpuidle: Fix idle time tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ville reported that on his Core2, which has TSC stop in idle, we would always report very short idle durations. He tracked this down to commit: e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()") which replaces ktime_get() with local_clock(). Add a sched_clock_idle_wakeup_event() call, which will re-sync the clock with ktime_get_ns() when TSC is unstable and no-op otherwise. Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()") Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c30c05f05d6f..d4c2f89fac92 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -410,14 +410,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled; resync with ktime. (called with irqs disabled): + * We just idled; resync with ktime. */ void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.2.3 From 2e44b7ddf8ab01cf98106c68388f87af15fbde73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:46:57 +0200 Subject: sched/clock: Use late_initcall() instead of sched_init_smp() Core2 marks its TSC unstable in ACPI Processor Idle, which is probed after sched_init_smp(). Luckily it appears both acpi_processor and intel_idle (which has a similar check) are mandatory built-in. This means we can delay switching to stable until after these drivers have ran (if they were modules, this would be impossible). Delay the stable switch to late_initcall() to allow these drivers to mark TSC unstable and avoid difficult stable->unstable transitions. Reported-by: Lofstedt, Marta Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 10 +++++++++- kernel/sched/core.c | 2 -- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index d4c2f89fac92..a2f847c6ada8 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -202,7 +203,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -216,7 +221,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..5794f4acad15 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5984,7 +5984,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -6000,7 +5999,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 7708d5f04de4dd5d2110df3244372b1e3f61bc7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:52:52 +0200 Subject: sched/clock: Print a warning recommending 'tsc=unstable' With our switch to stable delayed until late_initcall(), the most likely cause of hitting mark_tsc_unstable() is the watchdog. The watchdog typically only triggers when creative BIOS'es fiddle with the TSC to hide SMI latency. Since the watchdog can only detect TSC fiddling after the fact all TSC clocks (including userspace GTOD) can already have reported funny values. The only way to fully avoid this, is manually marking the TSC unstable at boot. Suggest people do this on their broken systems. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a2f847c6ada8..1a0d389d2f2b 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -175,6 +175,7 @@ static void __sched_clock_work(struct work_struct *work) for_each_possible_cpu(cpu) per_cpu(sched_clock_data, cpu) = *scd; + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, scd->tick_raw, __sched_clock_offset); -- cgit v1.2.3 From 8c0334697dc37eb3d6d7632304d3a3662248daac Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 13 Apr 2017 10:56:07 -0300 Subject: sched/topology: Refactor function build_overlap_sched_groups() Create functions build_group_from_child_sched_domain() and init_overlap_sched_group(). No functional change. Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1492091769-19879-2-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 62 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..d786d45c44d9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -513,6 +513,47 @@ int group_balance_cpu(struct sched_group *sg) return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); } +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) +{ + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_cpus(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg, int cpu) +{ + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg_span = sched_group_cpus(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -537,31 +578,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + init_overlap_sched_group(sd, sg, i); /* * Make sure the first group of this domain contains the -- cgit v1.2.3 From c743f0a5c50f2fcbc628526279cfa24f3dabe182 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 14:20:05 +0200 Subject: sched/fair, cpumask: Export for_each_cpu_wrap() More users for for_each_cpu_wrap() have appeared. Promote the construct to generic cpumask interface. The implementation is slightly modified to reduce arguments. Signed-off-by: Peter Zijlstra (Intel) Cc: Lauro Ramos Venancio Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: lwang@redhat.com Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 45 ++++----------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f1825d60937..f80c825e2b43 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5640,43 +5640,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. - */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5699,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5709,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5812,7 +5775,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5829,7 +5792,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) -- cgit v1.2.3 From 0372dd2736e02672ac6e189c31f7d8c02ad543cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:24:02 +0200 Subject: sched/topology: Fix building of overlapping sched-groups When building the overlapping groups, we very obviously should start with the previous domain of _this_ @cpu, not CPU-0. This can be readily demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 Where (for example) CPU1 ends up generating the following nonsensical groups: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 2 0 [] domain 1: span 0-3 level NUMA [] groups: 1-3 (cpu_capacity = 3072) 0-1,3 (cpu_capacity = 3072) Where the fact that domain 1 doesn't include a group with span 0-2 is the obvious fail. With patch this looks like: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 0 2 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (cpu_capacity = 3072) 0,2-3 (cpu_capacity = 3072) Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d786d45c44d9..921dedde2ee1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -566,7 +566,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) -- cgit v1.2.3 From 91eaed0d61319f58a9f8e43d41a8cbb069b4f73d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:32:07 +0200 Subject: sched/topology: Simplify build_overlap_sched_groups() Now that the first group will always be the previous domain of this @cpu this can be simplified. In fact, writing the code now removed should've been a big clue I was doing it wrong :/ Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 921dedde2ee1..6b10e0a956c7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -557,7 +557,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -587,15 +587,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) init_overlap_sched_group(sd, sg, i); - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; - if (!first) first = sg; if (last) @@ -603,7 +594,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; -- cgit v1.2.3 From b0151c25548cacc50771a7930475727c6c8ee869 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:29:16 +0200 Subject: sched/debug: Print the scheduler topology group mask In order to determine the balance_cpu (for should_we_balance()) we need the sched_group_mask() for overlapping domains. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6b10e0a956c7..3d50ee38b8fb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -82,12 +82,22 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); + + if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { + printk(KERN_CONT " (mask: %*pbl)", + cpumask_pr_args(sched_group_mask(group))); + } + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", + printk(KERN_CONT " (cpu_capacity: %lu)", group->sgc->capacity); } group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); -- cgit v1.2.3 From a420b0630362c2c451060e6187e36d72df827134 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 18:20:48 +0200 Subject: sched/topology: Verify the first group matches the child domain We want sched_groups to be sibling child domains (or individual CPUs when there are no child domains). Furthermore, since the first group of a domain should include the CPU of that domain, the first group of each domain should match the child domain. Verify this is indeed so. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3d50ee38b8fb..81c82031ed95 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -93,6 +93,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->capacity); } + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); + } + group = group->next; if (group != sd->groups) -- cgit v1.2.3 From f32d782e31bf079f600dcec126ed117b0577e85c Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 20 Apr 2017 16:51:40 -0300 Subject: sched/topology: Optimize build_group_mask() The group mask is always used in intersection with the group CPUs. So, when building the group mask, we don't have to care about CPUs that are not part of the group. Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lwang@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1492717903-5195-2-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 81c82031ed95..5a4d9aeda258 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -506,12 +506,12 @@ enum s_alloc { */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; -- cgit v1.2.3 From c20e1ea4b61c3d99a354d912f2d74822fd2a001d Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 20 Apr 2017 16:51:42 -0300 Subject: sched/topology: Move comment about asymmetric node setups Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lwang@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1492717903-5195-4-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5a4d9aeda258..c10f44a1ab2d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -495,14 +495,6 @@ enum s_alloc { /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. - * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. - * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { @@ -590,7 +582,16 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. + */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; -- cgit v1.2.3 From af85596c74de2fd9abb87501ae280038ac28a3f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Apr 2017 17:36:41 +0200 Subject: sched/topology: Remove FORCE_SD_OVERLAP Its an obsolete debug mechanism and future code wants to rely on properties this undermines. Namely, it would be good to assume that SD_OVERLAP domains have children, but if we build the entire hierarchy with SD_OVERLAP this is obviously false. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 1 - kernel/sched/topology.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..dc4d1483b038 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c10f44a1ab2d..21efacf547d4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1439,7 +1439,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; -- cgit v1.2.3 From 73bb059f9b8a00c5e1bf2f7ca83138c05d05e600 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 14:00:49 +0200 Subject: sched/topology: Fix overlapping sched_group_mask The point of sched_group_mask is to select those CPUs from sched_group_cpus that can actually arrive at this balance domain. The current code gets it wrong, as can be readily demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 Where (for example) domain 1 on CPU1 ends up with a mask that includes CPU0: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 (mask: 1), 2, 0 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (mask: 0-2) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072) This causes sched_balance_cpu() to compute the wrong CPU and consequently should_we_balance() will terminate early resulting in missed load-balance opportunities. The fixed topology looks like: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 (mask: 1), 2, 0 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (mask: 1) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072) (note: this relies on OVERLAP domains to always have children, this is true because the regular topology domains are still here -- this is before degenerate trimming) Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 21efacf547d4..09563c1d1d5b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -495,6 +495,9 @@ enum s_alloc { /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. + * + * Only CPUs that can arrive at this group should be considered to continue + * balancing. */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { @@ -505,11 +508,24 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) + continue; + + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); } /* -- cgit v1.2.3 From 8d5dc5126bb2bbcebf0b1e061cca2fc02c935620 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 15:29:40 +0200 Subject: sched/topology: Small cleanup Move the allocation of topology specific cpumasks into the topology code. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +--- kernel/sched/sched.h | 4 +--- kernel/sched/topology.c | 7 +++++-- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5794f4acad15..dde5d1e860f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5958,7 +5958,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5968,7 +5967,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -6197,7 +6196,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..6e1eae717a24 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -606,11 +606,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 09563c1d1d5b..a4b868c76f3c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1526,7 +1526,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1568,10 +1568,13 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. */ -int init_sched_domains(const struct cpumask *cpu_map) +int sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); -- cgit v1.2.3 From 005f874dd2843116e2ea079e3679f4f318f12fee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Apr 2017 17:35:35 +0200 Subject: sched/topology: Add sched_group_capacity debugging Add sgc::id to easier spot domain construction issues. Take the opportunity to slightly rework the group printing, because adding more "(id: %d)" strings makes the entire thing very hard to read. Also the individual groups are very hard to separate, so add explicit visual grouping, which allows replacing all the "(%s: %d)" format things with shorter "%s=%d" variants. Then fix up some inconsistencies in surrounding prints for domains. The end result looks like: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++++ kernel/sched/topology.c | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6e1eae717a24..4312b2adfb02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,6 +1023,10 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + unsigned long cpumask[0]; /* iteration mask */ }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a4b868c76f3c..12af4b157928 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -35,7 +35,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,7 +45,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -80,18 +80,17 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_cpus(group))); if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { - printk(KERN_CONT " (mask: %*pbl)", + printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(sched_group_mask(group))); } - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity: %lu)", - group->sgc->capacity); - } + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), @@ -99,6 +98,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; if (group != sd->groups) @@ -129,7 +130,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -1356,6 +1357,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } -- cgit v1.2.3 From 1676330ecfa840113a37b25a49afda068380d19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 14:31:11 +0200 Subject: sched/topology: Fix overlapping sched_group_capacity When building the overlapping groups we need to attach a consistent sched_group_capacity structure. That is, all 'identical' sched_group's should have the _same_ sched_group_capacity. This can (once again) be demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 But we need at least 2 CPUs per node for this to show up, after all, if there is only one CPU per node, our CPU @i is per definition a unique CPU that reaches this domain (aka balance-cpu). Given the above NUMA topo and 2 CPUs per node: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } [] CPU1 attaching sched-domain(s): [] domain-0: span=1,5 level=DIE [] groups: 1:{ span=1 }, 5:{ span=5 } [] domain-1: span=0-2,4-6 level=NUMA [] groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 4:{ span=0,4 mask=0,4 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 } Observe how CPU0-domain1-group0 and CPU1-domain1-group4 are the 'same' but have a different id (0 vs 4). To fix this, use the group balance CPU to select the SGC. This means we have to compute the full mask for each CPU and require a second temporary mask to store the group mask in (it otherwise lives in the SGC). The fixed topology looks like: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } [] CPU1 attaching sched-domain(s): [] domain-0: span=1,5 level=DIE [] groups: 1:{ span=1 }, 5:{ span=5 } [] domain-1: span=0-2,4-6 level=NUMA [] groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 0:{ span=0,4 mask=0,4 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 } Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 12af4b157928..4f6fa7553d92 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -500,13 +501,16 @@ enum s_alloc { * Only CPUs that can arrive at this group should be considered to continue * balancing. */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +static void +build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; + cpumask_clear(mask); + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); @@ -522,11 +526,11 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + cpumask_set_cpu(i, mask); } /* We must not have empty masks here */ - WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); + WARN_ON_ONCE(cpumask_empty(mask)); } /* @@ -560,14 +564,19 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) } static void init_overlap_sched_group(struct sched_domain *sd, - struct sched_group *sg, int cpu) + struct sched_group *sg) { + struct cpumask *mask = sched_domains_tmpmask2; struct sd_data *sdd = sd->private; struct cpumask *sg_span; + int cpu; + + build_group_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_cpus(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); + cpumask_copy(sched_group_mask(sg), mask); /* * Initialize sgc->capacity such that even if we mess up the @@ -619,7 +628,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_cpus(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg, i); + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -1578,6 +1587,7 @@ int sched_init_domains(const struct cpumask *cpu_map) int err; zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); -- cgit v1.2.3 From 35a566e6e8a18c3bc16229abeac146a707b8f216 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Apr 2017 10:54:26 +0200 Subject: sched/topology: Add a few comments Try and describe what this code is about.. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 193 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4f6fa7553d92..b2790830e184 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -494,12 +494,128 @@ enum s_alloc { sa_none, }; +/* + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the iteration mask. + * + * The iteration mask are all those CPUs that could actually end up at this + * group. See build_group_mask(). + * + * Also see should_we_balance(). + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group iteration mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. * * Only CPUs that can arrive at this group should be considered to continue * balancing. + * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) @@ -534,14 +650,10 @@ build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) -{ - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); -} - static struct sched_group * build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { @@ -577,6 +689,8 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) cpumask_copy(sched_group_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask)); /* * Initialize sgc->capacity such that even if we mess up the @@ -647,6 +761,78 @@ fail: return -ENOMEM; } + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. + * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. + */ + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); -- cgit v1.2.3 From 0c0e776a9b0f21ac41d4c6982c57493928524dba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 May 2017 14:18:06 +0200 Subject: sched/topology: Rewrite get_group() We want to attain: sg_cpus() & sg_mask() == sg_mask() for this to be so we must initialize sg_mask() to sg_cpus() for the !overlap case (its currently cpumask_setall()). Since the code makes my head hurt bad, rewrite it into a simpler form, inspired by the now fixed overlap code. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b2790830e184..dea1950b42a5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -833,23 +833,34 @@ fail: * [*] in other words, the first group of each domain is its child domain. */ -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); + cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_cpus(sg)); + cpumask_set_cpu(cpu, sched_group_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -868,34 +879,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_cpus(sg)); if (!first) first = sg; @@ -904,6 +901,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } -- cgit v1.2.3 From af218122b103900fa33d408aea0c2468791e698c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 08:51:05 +0200 Subject: sched/topology: Simplify sched_group_mask() usage While writing the comments, it occurred to me that: sg_cpus & sg_mask == sg_mask at least conceptually; the !overlap case sets the all 1s mask. If we correct that we can simplify things and directly use sg_mask. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++---- kernel/sched/topology.c | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f80c825e2b43..1eb32d4513ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7996,7 +7996,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; + struct cpumask *sg_mask; int cpu, balance_cpu = -1; /* @@ -8006,11 +8006,10 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, sg_mask, env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dea1950b42a5..bf53a99eb511 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -85,7 +85,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_cpus(group))); - if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(sched_group_mask(group))); } @@ -505,7 +506,7 @@ enum s_alloc { */ int group_balance_cpu(struct sched_group *sg) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + return cpumask_first(sched_group_mask(sg)); } -- cgit v1.2.3 From e5c14b1fb89213ff718261e6fb1bb29c5ffbbe99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 10:47:02 +0200 Subject: sched/topology: Rename sched_group_mask() Since sched_group_mask() is now an independent cpumask (it no longer masks sched_group_cpus()), rename the thing. Suggested-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- kernel/sched/sched.h | 7 +++---- kernel/sched/topology.c | 33 +++++++++++++++------------------ 3 files changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1eb32d4513ea..a7d84c8a7881 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7996,7 +7996,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_mask; int cpu, balance_cpu = -1; /* @@ -8006,9 +8005,8 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_mask, env->cpus) { + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4312b2adfb02..f7c70575ae34 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1027,7 +1027,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* iteration mask */ + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1054,10 +1054,9 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bf53a99eb511..070191f02035 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -86,9 +86,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_pr_args(sched_group_cpus(group))); if ((sd->flags & SD_OVERLAP) && - !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) { + !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) { printk(KERN_CONT " mask=%*pbl", - cpumask_pr_args(sched_group_mask(group))); + cpumask_pr_args(group_balance_mask(group))); } if (group->sgc->capacity != SCHED_CAPACITY_SCALE) @@ -497,16 +497,16 @@ enum s_alloc { /* * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * of this group that's also in the balance mask. * - * The iteration mask are all those CPUs that could actually end up at this - * group. See build_group_mask(). + * The balance mask are all those CPUs that could actually end up at this + * group. See build_balance_mask(). * * Also see should_we_balance(). */ int group_balance_cpu(struct sched_group *sg) { - return cpumask_first(sched_group_mask(sg)); + return cpumask_first(group_balance_mask(sg)); } @@ -563,7 +563,7 @@ int group_balance_cpu(struct sched_group *sg) * groups include the CPUs of Node-0, while those CPUs would not in fact ever * end up at those groups (they would end up in group: 0-1,3). * - * To correct this we have to introduce the group iteration mask. This mask + * To correct this we have to introduce the group balance mask. This mask * will contain those CPUs in the group that can reach this group given the * (child) domain tree. * @@ -607,11 +607,8 @@ int group_balance_cpu(struct sched_group *sg) /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. - * - * Only CPUs that can arrive at this group should be considered to continue - * balancing. + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. * * We do this during the group creation pass, therefore the group information * isn't complete yet, however since each group represents a (child) domain we @@ -619,7 +616,7 @@ int group_balance_cpu(struct sched_group *sg) * complete). */ static void -build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; @@ -684,14 +681,14 @@ static void init_overlap_sched_group(struct sched_domain *sd, struct cpumask *sg_span; int cpu; - build_group_mask(sd, sg, mask); + build_balance_mask(sd, sg, mask); cpu = cpumask_first_and(sched_group_cpus(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) - cpumask_copy(sched_group_mask(sg), mask); + cpumask_copy(group_balance_mask(sg), mask); else - WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask)); + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); /* * Initialize sgc->capacity such that even if we mess up the @@ -852,10 +849,10 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); - cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg)); + cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg)); } else { cpumask_set_cpu(cpu, sched_group_cpus(sg)); - cpumask_set_cpu(cpu, sched_group_mask(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); -- cgit v1.2.3 From ae4df9d6c935105857d9d166b615e3f17531ce6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 11:03:12 +0200 Subject: sched/topology: Rename sched_group_cpus() There's a discrepancy in naming between the sched_domain and sched_group cpumask accessor. Since we're doing changes, fix it. $ git grep sched_group_cpus | wc -l 28 $ git grep sched_domain_span | wc -l 38 Suggests changing sched_group_cpus() into sched_group_span(): for i in `git grep -l sched_group_cpus` do sed -ie 's/sched_group_cpus/sched_group_span/g' $i done Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- kernel/sched/sched.h | 4 ++-- kernel/sched/topology.c | 38 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7d84c8a7881..eede181b4530 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5484,12 +5484,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5499,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5602,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -7192,7 +7192,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7371,7 +7371,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7535,7 +7535,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7890,7 +7890,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8043,7 +8043,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7c70575ae34..f8cf1d87f065 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1048,7 +1048,7 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } @@ -1067,7 +1067,7 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 070191f02035..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -53,7 +53,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -66,27 +66,27 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); printk(KERN_CONT " %d:{ span=%*pbl", group->sgc->id, - cpumask_pr_args(sched_group_cpus(group))); + cpumask_pr_args(sched_group_span(group))); if ((sd->flags & SD_OVERLAP) && - !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) { + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); } @@ -96,7 +96,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), - sched_group_cpus(group))) { + sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } @@ -618,7 +618,7 @@ int group_balance_cpu(struct sched_group *sg) static void build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { - const struct cpumask *sg_span = sched_group_cpus(sg); + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; @@ -664,7 +664,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) if (!sg) return NULL; - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); if (sd->child) cpumask_copy(sg_span, sched_domain_span(sd->child)); else @@ -682,7 +682,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, int cpu; build_balance_mask(sd, sg, mask); - cpu = cpumask_first_and(sched_group_cpus(sg), mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) @@ -695,7 +695,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } @@ -737,7 +737,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!sg) goto fail; - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); init_overlap_sched_group(sd, sg); @@ -848,14 +848,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) atomic_inc(&sg->sgc->ref); if (child) { - cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); - cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg)); + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); } else { - cpumask_set_cpu(cpu, sched_group_cpus(sg)); + cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); } - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; return sg; @@ -890,7 +890,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) sg = get_group(i, sdd); - cpumask_or(covered, covered, sched_group_cpus(sg)); + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -923,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) -- cgit v1.2.3 From 502ce005ab95d5d9481768649dbab808845b24d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 May 2017 15:31:22 +0200 Subject: sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs In order to allow leaf_cfs_rq_list to remove entries switch the bandwidth hotplug code over to the task_groups list. Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170504133122.a6qjlj3hlblbjxux@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eede181b4530..c8fa777de76c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4642,24 +4642,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. + */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4696,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ -- cgit v1.2.3 From a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Apr 2017 17:43:50 -0700 Subject: sched/fair: Fix O(nr_cgroups) in load balance path Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all live cfs_rqs which have ever been active on the CPU; unfortunately, this makes update_blocked_averages() O(# total cgroups) which isn't scalable at all. This shows up as a small CPU consumption and scheduling latency increase in the load balancing path in systems with CPU controller enabled across most cgroups. In an edge case where temporary cgroups were leaking, this caused the kernel to consume good several tens of percents of CPU cycles running update_blocked_averages(), each run taking multiple millisecs. This patch fixes the issue by taking empty and fully decayed cfs_rqs off the rq->leaf_cfs_rq_list. Signed-off-by: Tejun Heo [ Added cfs_rq_is_decayed() ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8fa777de76c..219fe58e3023 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -- cgit v1.2.3 From 7b4ff1adb57ad96d8f12a05d8c661a3d8c4d2be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 11 May 2017 10:17:45 -0300 Subject: mutex, futex: adjust kernel-doc markups to generate ReST There are a few issues on some kernel-doc markups that was causing troubles with kernel-doc output on ReST format: ./kernel/futex.c:492: WARNING: Inline emphasis start-string without end-string. ./kernel/futex.c:1264: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:1721: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2338: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2426: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2899: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2972: WARNING: Block quote ends without a blank line; unexpected unindent. Fix them. No functional changes. Acked-by: Darren Hart (VMware) Signed-off-by: Mauro Carvalho Chehab --- kernel/futex.c | 40 ++++++++++++++++++++-------------------- kernel/locking/mutex.c | 6 +++--- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..b8ae87d227da 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key) * * Return: a negative error code or 0 * - * The key words are stored in *key on success. + * The key words are stored in @key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). @@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: - * 0 - ready to wait; - * 1 - acquired the lock; - * <0 - error + * - 0 - ready to wait; + * - 1 - acquired the lock; + * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. */ @@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * hb1 and hb2 must be held by the caller. * * Return: - * 0 - failed to acquire the lock atomically; - * >0 - acquired the lock, return value is vpid of the top_waiter - * <0 - error + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, @@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * uaddr2 atomically on behalf of the top waiter. * * Return: - * >=0 - on success, the number of tasks requeued or woken; - * <0 - on error + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_requeue, @@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * be paired with exactly one earlier call to queue_me(). * * Return: - * 1 - if the futex_q was still queued (and we removed unqueued it); - * 0 - if the futex_q was already removed by the waking thread + * - 1 - if the futex_q was still queued (and we removed unqueued it); + * - 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) { @@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart); * acquire the lock. Must be called with the hb lock held. * * Return: - * 1 - success, lock taken; - * 0 - success, lock not taken; - * <0 - on error (-EFAULT) + * - 1 - success, lock taken; + * - 0 - success, lock not taken; + * - <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { @@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * with no q.key reference on failure. * * Return: - * 0 - uaddr contains val and hb has been locked; - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - 0 - uaddr contains val and hb has been locked; + * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2895,8 +2895,8 @@ pi_faulted: * called with the hb lock held. * * Return: - * 0 = no early wakeup detected; - * <0 = -ETIMEDOUT or -ERESTARTNOINTR + * - 0 = no early wakeup detected; + * - <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * If 4 or 7, we cleanup and return with -ETIMEDOUT. * * Return: - * 0 - On success; - * <0 - On error + * - 0 - On success; + * - <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 198527a62149..858a07590e39 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock); * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) + * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging) * * This function is similar to (but not equivalent to) down(). */ -- cgit v1.2.3 From c0c6e0850514c16814c37f64a9a1bcc6c52a19ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 14 May 2017 12:03:39 -0300 Subject: irq: update genericirq book location This book got converted from DocBook. Update its references to point to the current location. Signed-off-by: Mauro Carvalho Chehab --- kernel/irq/chip.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/irqdesc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..4188a0a7691f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -7,7 +7,7 @@ * This file contains the core interrupt handling code, for irq-chip * based architectures. * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst */ #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..bbf9a7174283 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -6,7 +6,7 @@ * * This file contains the core interrupt handling code. * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..22e443133987 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -4,7 +4,7 @@ * * This file contains the interrupt descriptor management code * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ #include -- cgit v1.2.3 From ce6cf9a15d62fd7ee92f4f9bb754883bacf85a3e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 May 2017 16:36:19 +0200 Subject: nohz: Add hrtimer sanity check Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..d212bb62bc08 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -771,8 +771,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) - goto out; + if (ts->tick_stopped) { + if (hrtimer_active(&ts->sched_timer)) + WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event); + + if (expires == dev->next_event) + goto out; + } /* * nohz_stop_sched_tick can be called several times before -- cgit v1.2.3 From 411fe24e6b7c283c3a1911450cdba6dd3aaea56e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Apr 2017 16:00:54 +0200 Subject: nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright Reported-and-tested-by: Pavel Machek Reported-by: James Hartsock Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 37 +++++++++++++++++++++++++++++++------ kernel/time/tick-sched.h | 2 ++ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d212bb62bc08..764d2905e6a5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. + */ + ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, @@ -771,12 +783,15 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped) { - if (hrtimer_active(&ts->sched_timer)) - WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event); - - if (expires == dev->next_event) + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (likely(dev->next_event <= ts->next_tick)) goto out; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* @@ -796,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -811,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -869,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; return false; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; -- cgit v1.2.3 From 33c35aa4817864e056fd772230b0c6b552e36ea2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 May 2017 09:34:06 -0400 Subject: cgroup: Prevent kill_css() from being called more than once The kill_css() function may be called more than once under the condition that the css was killed but not physically removed yet followed by the removal of the cgroup that is hosting the css. This patch prevents any harmm from being done when that happens. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.5+ --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c3c9a0e1b3c9..8d4e85eae42c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4265,6 +4265,11 @@ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); + if (css->flags & CSS_DYING) + return; + + css->flags |= CSS_DYING; + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. -- cgit v1.2.3 From af777cd1b83e95138e7285fde87c795ef0ae7c4d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 13 May 2017 04:51:40 -0700 Subject: doc: ReSTify credentials.txt This updates the credentials API documentation to ReST markup and moves it under the security subsection of kernel API documentation. Cc: David Howells Signed-off-by: Kees Cook Signed-off-by: Jonathan Corbet --- kernel/cred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 2bc66075740f..ecf03657e71c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/security/credentials.txt +/* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.2.3 From 719f6a7040f1bdaf96fcc709d272548facb88e90 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 20 Apr 2017 10:52:31 +0200 Subject: printk: Use the main logbuf in NMI when logbuf_lock is available The commit 42a0bb3f71383b457a7d ("printk/nmi: generic solution for safe printk in NMI") caused that printk stores messages into a temporary buffer in NMI context. The buffer is per-CPU and therefore the size is rather limited. It works quite well for NMI backtraces. But there are longer logs that might get printed in NMI context, for example, lockdep warnings, ftrace_dump_on_oops. The temporary buffer is used to avoid deadlocks caused by logbuf_lock. Also it is needed to avoid races with the other temporary buffer that is used when PRINTK_SAFE_CONTEXT is entered. But the main buffer can be used in NMI if the lock is available and we did not interrupt PRINTK_SAFE_CONTEXT. The lock is checked using raw_spin_is_locked(). It might cause false negatives when the lock is taken on another CPU and this CPU is in the safe context from other reasons. Note that the safe context is used also to get console semaphore or when calling console drivers. For this reason, we do the check in printk_nmi_enter(). It makes the handling consistent for the entire NMI handler and avoids reshuffling of the messages. The patch also defines special printk context that allows to use printk_deferred() in NMI. Note that we could not flush the messages to the consoles because console drivers might use many other internal locks. The newly created vprintk_deferred() disables the preemption only around the irq work handling. It is needed there to keep the consistency between the two per-CPU variables. But there is no reason to disable preemption around vprintk_emit(). Finally, the patch puts back explicit serialization of the NMI backtraces from different CPUs. It was removed by the commit a9edc88093287183ac934b ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). It was not needed because the flushing of the temporary per-CPU buffers was serialized. Link: http://lkml.kernel.org/r/1493912763-24873-1-git-send-email-pmladek@suse.com Cc: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Russell King Cc: Daniel Thompson Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Chris Metcalf Cc: x86@kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Suggested-by: Sergey Senozhatsky Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 6 ++++-- kernel/printk/printk.c | 19 ++++++++++++++----- kernel/printk/printk_safe.c | 26 ++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1db044f808b7..2a7d04049af4 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -18,12 +18,14 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff +#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; __printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..e5278e7d1922 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2722,16 +2722,13 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_deferred(const char *fmt, ...) +int vprintk_deferred(const char *fmt, va_list args) { - va_list args; int r; - preempt_disable(); - va_start(args, fmt); r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - va_end(args); + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); @@ -2739,6 +2736,18 @@ int printk_deferred(const char *fmt, ...) return r; } +int printk_deferred(const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_deferred(fmt, args); + va_end(args); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 033e50a7d706..03a42a539b20 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,12 +308,24 @@ static int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + /* + * The size of the extra per-CPU buffer is limited. Use it only when + * the main one is locked. If this CPU is not in the safe context, + * the lock must be taken on another CPU and we could wait for it. + */ + if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && + raw_spin_is_locked(&logbuf_lock)) { + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + } else { + this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); + } } void printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_and(printk_context, + ~(PRINTK_NMI_CONTEXT_MASK | + PRINTK_NMI_DEFERRED_CONTEXT_MASK)); } #else @@ -351,12 +363,22 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); + /* Use extra buffer to prevent a recursion deadlock in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); + /* + * Use the main logbuf when logbuf_lock is available in NMI. + * But avoid calling console drivers that might have their own locks. + */ + if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) + return vprintk_deferred(fmt, args); + + /* No obstacles. */ return vprintk_default(fmt, args); } -- cgit v1.2.3 From 7e95a225901a5d2fd140f14b4302805cecc22da7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:52:01 -0400 Subject: move compat wait4 and waitid next to native variants Signed-off-by: Al Viro --- kernel/compat.c | 66 ------------------------------------------------------ kernel/exit.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..b4cdba6bbd02 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -543,72 +543,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -COMPAT_SYSCALL_DEFINE4(wait4, - compat_pid_t, pid, - compat_uint_t __user *, stat_addr, - int, options, - struct compat_rusage __user *, ru) -{ - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; - } -} - -COMPAT_SYSCALL_DEFINE5(waitid, - int, which, compat_pid_t, pid, - struct compat_siginfo __user *, uinfo, int, options, - struct compat_rusage __user *, uru) -{ - siginfo_t info; - struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); - - set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); - else - ret = put_compat_rusage(&ru, uru); - if (ret) - return -EFAULT; - } - - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(uinfo, &info); -} - static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) { diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..f98782bd27b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1735,3 +1736,71 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) } #endif + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) +{ + if (!ru) { + return sys_wait4(pid, stat_addr, options, NULL); + } else { + struct rusage r; + int ret; + unsigned int status; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_wait4(pid, + (stat_addr ? + (unsigned int __user *) &status : NULL), + options, (struct rusage __user *) &r); + set_fs (old_fs); + + if (ret > 0) { + if (put_compat_rusage(&r, ru)) + return -EFAULT; + if (stat_addr && put_user(status, stat_addr)) + return -EFAULT; + } + return ret; + } +} + +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, infop, int, options, + struct compat_rusage __user *, uru) +{ + siginfo_t info; + struct rusage ru; + long ret; + mm_segment_t old_fs = get_fs(); + + memset(&info, 0, sizeof(info)); + + set_fs(KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? (struct rusage __user *)&ru : NULL); + set_fs(old_fs); + + if ((ret < 0) || (info.si_signo == 0)) + return ret; + + if (uru) { + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); + if (ret) + return -EFAULT; + } + + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return copy_siginfo_to_user32(infop, &info); +} +#endif -- cgit v1.2.3 From ce72a16fa705f960ca2352e95a7c5f4801475e75 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:25:02 -0400 Subject: wait4(2)/waitid(2): separate copying rusage to userland New helpers: kernel_waitid() and kernel_wait4(). sys_waitid(), sys_wait4() and their compat variants switched to those. Copying struct rusage to userland is left to syscall itself. For compat_sys_wait4() that eliminates the use of set_fs() completely. For compat_sys_waitid() it's still needed (for siginfo handling); that will change shortly. Signed-off-by: Al Viro --- kernel/exit.c | 89 ++++++++++++++++++++++++++++++++++------------------------- kernel/sys.c | 16 ++++------- 2 files changed, 58 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f98782bd27b6..d44f12948c5f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1003,7 +1003,7 @@ struct wait_opts { struct siginfo __user *wo_info; int __user *wo_stat; - struct rusage __user *wo_rusage; + struct rusage *wo_rusage; wait_queue_t child_wait; int notask_error; @@ -1054,8 +1054,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { struct siginfo __user *infop; - int retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + int retval = 0; + + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; @@ -1182,8 +1184,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_unlock_irq(¤t->sighand->siglock); } - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; if (!retval && wo->wo_stat) @@ -1316,8 +1319,9 @@ unlock_sig: if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; if (!retval && wo->wo_stat) retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); @@ -1377,8 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (!wo->wo_info) { - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; put_task_struct(p); if (!retval && wo->wo_stat) retval = put_user(0xffff, wo->wo_stat); @@ -1618,8 +1623,8 @@ end: return retval; } -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) +static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1687,8 +1692,21 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return ret; } -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL); + + if (!err) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + +static long kernel_wait4(pid_t upid, int __user *stat_addr, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1724,6 +1742,19 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, return ret; } +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); + + if (err > 0) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + #ifdef __ARCH_WANT_SYS_WAITPID /* @@ -1744,29 +1775,13 @@ COMPAT_SYSCALL_DEFINE4(wait4, int, options, struct compat_rusage __user *, ru) { - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; + struct rusage r; + long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); + if (err > 0) { + if (ru && put_compat_rusage(&r, ru)) + return -EFAULT; } + return err; } COMPAT_SYSCALL_DEFINE5(waitid, @@ -1782,8 +1797,8 @@ COMPAT_SYSCALL_DEFINE5(waitid, memset(&info, 0, sizeof(info)); set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); + ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? &ru : NULL); set_fs(old_fs); if ((ret < 0) || (info.si_signo == 0)) diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..dab1a0658a92 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1552,7 +1552,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) r->ru_oublock += task_io_get_oublock(t); } -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; @@ -1626,20 +1626,16 @@ out: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; - return getrusage(current, who, ru); + + getrusage(current, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -1651,7 +1647,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) who != RUSAGE_THREAD) return -EINVAL; - k_getrusage(current, who, &r); + getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif -- cgit v1.2.3 From 359566faefa850504d146839d74496f0cf12d3b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:39:39 -0400 Subject: kernel_wait4()/kernel_waitid(): delay copying status to userland Signed-off-by: Al Viro --- kernel/exit.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d44f12948c5f..94cdccf8e7e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,7 +1002,7 @@ struct wait_opts { struct pid *wo_pid; struct siginfo __user *wo_info; - int __user *wo_stat; + int wo_stat; struct rusage *wo_rusage; wait_queue_t child_wait; @@ -1189,8 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; - if (!retval && wo->wo_stat) - retval = put_user(status, wo->wo_stat); + wo->wo_stat = status; infop = wo->wo_info; if (!retval && infop) @@ -1322,8 +1321,7 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); retval = 0; - if (!retval && wo->wo_stat) - retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (!retval && infop) @@ -1383,12 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) if (!wo->wo_info) { if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; put_task_struct(p); - if (!retval && wo->wo_stat) - retval = put_user(0xffff, wo->wo_stat); - if (!retval) - retval = pid; + wo->wo_stat = 0xffff; + retval = pid; } else { retval = wait_noreap_copyout(wo, p, pid, uid, CLD_CONTINUED, SIGCONT); @@ -1662,7 +1657,6 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; - wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); @@ -1734,10 +1728,12 @@ static long kernel_wait4(pid_t upid, int __user *stat_addr, wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; - wo.wo_stat = stat_addr; + wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); + if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) + ret = -EFAULT; return ret; } -- cgit v1.2.3 From 67d7ddded322db99f451a7959d56ed6c70a6c4aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:53:13 -0400 Subject: waitid(2): leave copyout of siginfo to syscall itself have kernel_waitid() collect the information needed for siginfo into a small structure (waitid_info) passed to it; deal with copyout in sys_waitid()/compat_sys_waitid(). Signed-off-by: Al Viro --- kernel/exit.c | 168 ++++++++++++++++++++++------------------------------------ 1 file changed, 64 insertions(+), 104 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 94cdccf8e7e7..42f26480b3cc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -996,12 +996,19 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + struct wait_opts { enum pid_type wo_type; int wo_flags; struct pid *wo_pid; - struct siginfo __user *wo_info; + struct waitid_info *wo_info; int wo_stat; struct rusage *wo_rusage; @@ -1053,8 +1060,7 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { - struct siginfo __user *infop; - int retval = 0; + struct waitid_info *infop; if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); @@ -1062,22 +1068,12 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->cause = why; + infop->pid = pid; + infop->uid = uid; + infop->status = status; } - if (!retval) - retval = pid; - return retval; + return pid; } /* @@ -1088,10 +1084,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - int state, retval, status; + int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); - struct siginfo __user *infop; + struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; @@ -1186,36 +1182,22 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - + if (infop) { if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; + infop->cause = CLD_EXITED; + infop->status = status >> 8; } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; + infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->pid = pid; + infop->uid = uid; } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); @@ -1232,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); - return retval; + return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) @@ -1268,8 +1250,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { - struct siginfo __user *infop; - int retval, exit_code, *p_code, why; + struct waitid_info *infop; + int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; @@ -1320,28 +1302,19 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; + if (infop) { + infop->cause = why; + infop->status = exit_code; + infop->pid = pid; + infop->uid = uid; + } put_task_struct(p); - BUG_ON(!retval); - return retval; + BUG_ON(!pid); + return pid; } /* @@ -1618,7 +1591,7 @@ end: return retval; } -static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; @@ -1660,27 +1633,8 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) { + if (ret > 0) ret = 0; - } else if (infop) { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); - } put_pid(pid); return ret; @@ -1690,12 +1644,24 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; - long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } + if (!infop) + return err; + + if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } @@ -1785,33 +1751,27 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { - siginfo_t info; struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); - set_fs(KERNEL_DS); - ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? &ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ + if (!err && uru) { + /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); + err = copy_to_user(uru, &ru, sizeof(ru)); else - ret = put_compat_rusage(&ru, uru); - if (ret) + err = put_compat_rusage(&ru, uru); + if (err) return -EFAULT; } - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(infop, &info); + if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } #endif -- cgit v1.2.3 From e61a250229fbf0f003e93676bf4d8a555a8c9eec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:25:03 -0400 Subject: lift getrusage() from wait_noreap_copyout() Signed-off-by: Al Viro --- kernel/exit.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 42f26480b3cc..d4f5097da85a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1062,9 +1062,6 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, { struct waitid_info *infop; - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - put_task_struct(p); infop = wo->wo_info; if (infop) { @@ -1099,6 +1096,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; @@ -1296,12 +1295,12 @@ unlock_sig: why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; @@ -1350,10 +1349,10 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (!wo->wo_info) { - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); wo->wo_stat = 0xffff; retval = pid; -- cgit v1.2.3 From bb380ec33a7d8ee048e722889627869d21a5d527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:33:21 -0400 Subject: kill wait_noreap_copyout() folds into callers Signed-off-by: Al Viro --- kernel/exit.c | 65 +++++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d4f5097da85a..f01ebaab978a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1057,22 +1057,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) return 1; } -static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, - pid_t pid, uid_t uid, int why, int status) -{ - struct waitid_info *infop; - - put_task_struct(p); - infop = wo->wo_info; - if (infop) { - infop->cause = why; - infop->pid = pid; - infop->uid = uid; - infop->status = status; - } - return pid; -} - /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold @@ -1091,22 +1075,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; + infop = wo->wo_info; + if (infop) { + if ((exit_code & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = exit_code >> 8; + } else { + infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = exit_code & 0x7f; + } + infop->pid = pid; + infop->uid = uid; } - return wait_noreap_copyout(wo, p, pid, uid, why, status); + return pid; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. @@ -1297,11 +1286,10 @@ unlock_sig: sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (unlikely(wo->wo_flags & WNOWAIT)) - return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - - wo->wo_stat = (exit_code << 8) | 0x7f; + if (likely(!(wo->wo_flags & WNOWAIT))) + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { @@ -1310,9 +1298,6 @@ unlock_sig: infop->pid = pid; infop->uid = uid; } - put_task_struct(p); - - BUG_ON(!pid); return pid; } @@ -1324,7 +1309,7 @@ unlock_sig: */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { - int retval; + struct waitid_info *infop; pid_t pid; uid_t uid; @@ -1351,18 +1336,18 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (!wo->wo_info) { - put_task_struct(p); + infop = wo->wo_info; + if (!infop) { wo->wo_stat = 0xffff; - retval = pid; } else { - retval = wait_noreap_copyout(wo, p, pid, uid, - CLD_CONTINUED, SIGCONT); - BUG_ON(retval == 0); + infop->cause = CLD_CONTINUED; + infop->pid = pid; + infop->uid = uid; + infop->status = SIGCONT; } - - return retval; + return pid; } /* -- cgit v1.2.3 From 76d9871e1122aabc086e7aade5251b1e5124cbb9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:38:26 -0400 Subject: wait_task_zombie: consolidate info logics Signed-off-by: Al Viro --- kernel/exit.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f01ebaab978a..97db9ee03f90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1074,28 +1074,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; - + status = p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); - - infop = wo->wo_info; - if (infop) { - if ((exit_code & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = exit_code >> 8; - } else { - infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = exit_code & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - return pid; + goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. @@ -1174,19 +1160,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; - infop = wo->wo_info; - if (infop) { - if ((status & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = status >> 8; - } else { - infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = status & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ @@ -1202,6 +1175,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); +out_info: + infop = wo->wo_info; + if (infop) { + if ((status & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = status >> 8; + } else { + infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; + } + infop->pid = pid; + infop->uid = uid; + } + return pid; } -- cgit v1.2.3 From 4c48abe91be03d191d0c20cc755877da2cb35622 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:27:32 -0400 Subject: waitid(): switch copyout of siginfo to unsafe_put_user() Signed-off-by: Al Viro --- kernel/exit.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 97db9ee03f90..f3b8c3a87bc1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1625,15 +1625,18 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; - + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } static long kernel_wait4(pid_t upid, int __user *stat_addr, @@ -1736,13 +1739,20 @@ COMPAT_SYSCALL_DEFINE5(waitid, return -EFAULT; } - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; + if (!infop) + return err; + + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } #endif -- cgit v1.2.3 From 92ebce5ac55dba258c608248dddf59eca3f7f514 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 23:54:33 -0400 Subject: osf_wait4: switch to kernel_wait4() ... and sanitize copying rusage to userland Signed-off-by: Al Viro --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f3b8c3a87bc1..462fc25eec6e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,8 +1639,8 @@ Efault: return -EFAULT; } -static long kernel_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage *ru) +long kernel_wait4(pid_t upid, int __user *stat_addr, int options, + struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; -- cgit v1.2.3 From e4eda884db7930cee434828759064b4711604078 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 22 May 2017 12:27:07 -0400 Subject: net: Make IP alignment calulations clearer. The assignmnet: ip_align = strict ? 2 : NET_IP_ALIGN; in compare_pkt_ptr_alignment() trips up Coverity because we can only get to this code when strict is true, therefore ip_align will always be 2 regardless of NET_IP_ALIGN's value. So just assign directly to '2' and explain the situation in the comment above. Reported-by: "Gustavo A. R. Silva" Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1eddb713b815..c72cd41f5b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -808,11 +808,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking - * we force this to 2 which is universally what architectures use - * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + /* For platforms that do not have a Kconfig enabling + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of + * NET_IP_ALIGN is universally set to '2'. And on platforms + * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get + * to this code only in strict mode where we want to emulate + * the NET_IP_ALIGN==2 checking. Therefore use an + * unconditional IP align value of '2'. */ - ip_align = strict ? 2 : NET_IP_ALIGN; + ip_align = 2; if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", ip_align, reg_off, off, size); -- cgit v1.2.3 From 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2017 17:48:50 +0200 Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock() Markus reported that the glibc/nptl/tst-robustpi8 test was failing after commit: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") The following trace shows the problem: ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0 ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0 ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000 ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161 which then returns with -ETIMEDOUT. That wrecks the lock state, because now the owner isn't aware it acquired the lock and removes the pending robust list entry. If 2161 is killed, the robust list will not clear out this futex and the subsequent acquire on this futex will then (correctly) result in -ESRCH which is unexpected by glibc, triggers an internal assertion and dies. Task 2161 Task 2165 rt_mutex_wait_proxy_lock() timeout(); /* T2161 is still queued in the waiter list */ return -ETIMEDOUT; futex_unlock_pi() spin_lock(hb->lock); rtmutex_unlock() remove_rtmutex_waiter(T2161); mark_lock_available(); /* Make the next waiter owner of the user space side */ futex_uval = 2161; spin_unlock(hb->lock); spin_lock(hb->lock); rt_mutex_cleanup_proxy_lock() if (rtmutex_owner() !== current) ... return FAIL; .... return -ETIMEOUT; This means that rt_mutex_cleanup_proxy_lock() needs to call try_to_take_rt_mutex() so it can take over the rtmutex correctly which was assigned by the waker. If the rtmutex is owned by some other task then this call is harmless and just confirmes that the waiter is not able to acquire it. While there, fix what looks like a merge error which resulted in rt_mutex_cleanup_proxy_lock() having two calls to fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any. Both should have one, since both potentially touch the waiter list. Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()") Reported-by: Markus Trippelsdorf Bug-Spotted-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Florian Weimer Cc: Darren Hart Cc: Sebastian Andrzej Siewior Cc: Markus Trippelsdorf Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/locking/rtmutex.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..28cd09e635ed 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1785,12 +1785,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, int ret; raw_spin_lock_irq(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1821,16 +1823,26 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, bool cleanup = false; raw_spin_lock_irq(&lock->wait_lock); + /* + * Do an unconditional try-lock, this deals with the lock stealing + * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() + * sets a NULL owner. + * + * We're not interested in the return value, because the subsequent + * test on rt_mutex_owner() will infer that. If the trylock succeeded, + * we will own the lock and it will have removed the waiter. If we + * failed the trylock, we're still not owner and we need to remove + * ourselves. + */ + try_to_take_rt_mutex(lock, current, waiter); /* * Unless we're the owner; we're still enqueued on the wait_list. * So check if we became owner, if not, take us off the wait_list. */ if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); - fixup_rt_mutex_waiters(lock); cleanup = true; } - /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. -- cgit v1.2.3 From 6f9a22bc5775d231ab8fbe2c2f3c88e45e3e7c28 Mon Sep 17 00:00:00 2001 From: Michael Hernandez Date: Thu, 18 May 2017 10:47:47 -0700 Subject: PCI/MSI: Ignore affinity if pre/post vector count is more than min_vecs min_vecs is the minimum amount of vectors needed to operate in MSI-X mode which may just include the vectors that don't need affinity. Disabling affinity settings causes the qla2xxx driver scsi_add_host() to fail when blk_mq is enabled as the blk_mq_pci_map_queues() expects affinity masks on each vector. Fixes: dfef358bd1be ("PCI/MSI: Don't apply affinity if there aren't enough vectors left") Signed-off-by: Michael Hernandez Signed-off-by: Himanshu Madhani Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org # v4.10+ --- kernel/irq/affinity.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..9b71406d2eec 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -66,6 +66,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) struct cpumask *masks; cpumask_var_t nmsk; + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (!affv) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -140,15 +147,19 @@ out: /** * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @minvec: The minimum number of vectors available * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; int cpus; + if (resv > minvec) + return 0; + /* Stabilize the cpumasks */ get_online_cpus(); cpus = cpumask_weight(cpu_online_mask); -- cgit v1.2.3 From 4d6501dce079c1eb6bf0b1d8f528a5e81770109e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 9 May 2017 09:39:59 +0200 Subject: kthread: Fix use-after-free if kthread fork fails If a kthread forks (e.g. usermodehelper since commit 1da5c46fa965) but fails in copy_process() between calling dup_task_struct() and setting p->set_child_tid, then the value of p->set_child_tid will be inherited from the parent and get prematurely freed by free_kthread_struct(). kthread() - worker_thread() - process_one_work() | - call_usermodehelper_exec_work() | - kernel_thread() | - _do_fork() | - copy_process() | - dup_task_struct() | - arch_dup_task_struct() | - tsk->set_child_tid = current->set_child_tid // implied | - ... | - goto bad_fork_* | - ... | - free_task(tsk) | - free_kthread_struct(tsk) | - kfree(tsk->set_child_tid) - ... - schedule() - __schedule() - wq_worker_sleeping() - kthread_data(task)->flags // UAF The problem started showing up with commit 1da5c46fa965 since it reused ->set_child_tid for the kthread worker data. A better long-term solution might be to get rid of the ->set_child_tid abuse. The comment in set_kthread_struct() also looks slightly wrong. Debugged-by: Jamie Iles Fixes: 1da5c46fa965 ("kthread: Make struct kthread kmalloc'ed") Signed-off-by: Vegard Nossum Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: Jamie Iles Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170509073959.17858-1-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/fork.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d681f8f10d2d..b7cdea10239c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1553,6 +1553,18 @@ static __latent_entropy struct task_struct *copy_process( if (!p) goto fork_out; + /* + * This _must_ happen before we call free_task(), i.e. before we jump + * to any of the bad_fork_* labels. This is to avoid freeing + * p->set_child_tid which is (ab)used as a kthread's data pointer for + * kernel threads (PF_KTHREAD). + */ + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1716,11 +1728,6 @@ static __latent_entropy struct task_struct *copy_process( } } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; - /* - * Clear TID on mm_release()? - */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif -- cgit v1.2.3 From fe17a42e704a64477b15bb2cf8366fe3e5119aff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:35 +0100 Subject: irqdomain: Let irq_domain_mapping display hierarchical domains Hierarchical domains seem to be hard to grasp, and a number of aspiring kernel hackers find them utterly discombobulating. In order to ease their pain, let's make them appear in /sys/kernel/debug/irq_domain_mapping, such as the following: 96 0x81808 MSI 0x (null) RADIX MSI 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2 [output compressed to fit in a commit log] This shows that IRQ96 is implemented by a stack of three domains, the + sign indicating the stacking. Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-2-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 68 +++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31805f237396..1f6cd2cacf74 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_find_mapping); #ifdef CONFIG_IRQ_DOMAIN_DEBUG +static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_domain *domain; + struct irq_data *data; + + domain = desc->irq_data.domain; + data = &desc->irq_data; + + while (domain) { + unsigned int irq = data->irq; + unsigned long hwirq = data->hwirq; + struct irq_chip *chip; + bool direct; + + if (data == &desc->irq_data) + seq_printf(m, "%5d ", irq); + else + seq_printf(m, "%5d+ ", irq); + seq_printf(m, "0x%05lx ", hwirq); + + chip = irq_data_get_irq_chip(data); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); + + seq_printf(m, data ? "0x%p " : " %p ", + irq_data_get_irq_chip_data(data)); + + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); + seq_printf(m, "%s\n", domain->name); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + domain = domain->parent; + data = data->parent_data; +#else + domain = NULL; +#endif + } +} + static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void *data, **slot; + void **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -782,30 +823,7 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); - domain = desc->irq_data.domain; - - if (domain) { - struct irq_chip *chip; - int hwirq = desc->irq_data.hwirq; - bool direct; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05x ", hwirq); - - chip = irq_desc_get_chip(desc); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, data ? "0x%p " : " %p ", data); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", desc->irq_data.domain->name); - } - + virq_debug_show_one(m, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.2.3 From 2370c00dc7232d0c4af224e7730b4de031f3b1a0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:36 +0100 Subject: irqdomain: Let irq_domain_mapping display ACPI fwnode attributes If the system is using ACPI, there is no of_node to display. But ACPI can use a struct irqchip_fwid as a domain identifier, and it can be used to display the name contained in that structure. The output on such a system will look like this: pMSI 0 0 0 irqchip@00000000e1180000 MSI 37 0 0 irqchip@00000000e1180000 GICv2m 37 0 0 irqchip@00000000e1180000 GICv2 448 448 0 irqchip@ffff000008003000 Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-3-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f6cd2cacf74..70b9da72018b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -801,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private) mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { struct device_node *of_node; + const char *name; + int count = 0; + of_node = irq_domain_get_of_node(domain); + if (of_node) + name = of_node_full_name(of_node); + else if (is_fwnode_irqchip(domain->fwnode)) + name = container_of(domain->fwnode, struct irqchip_fwid, + fwnode)->name; + else + name = ""; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - of_node ? of_node_full_name(of_node) : ""); + name); } mutex_unlock(&irq_domain_mutex); -- cgit v1.2.3 From a97b852b4d4c2f8c50cab13c71566639f9a1a990 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:37 +0100 Subject: genirq/msi: Populate the domain name if provided by the irqchip In order to ease debug, let's populate the domain name upfront, before any MSI gets requested. This allows the domain to appear in the irq_domain_mapping, and the user to easily find the expected data. Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-4-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ddc2f5427f75..fe4d48ec5bc4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + struct irq_domain *domain; + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + if (domain && info->chip && info->chip->name) + domain->name = info->chip->name; + + return domain; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.2.3 From 85c617abc786d7da9e95c0b4174159864dd3f85c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 22 May 2017 12:03:49 +0300 Subject: perf/core: Remove some dead code perf_init_event() can't return NULL. If it did, the error handling is incomplete and we would crash. I have removed this confusing dead code. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170522090348.5g7yyld5en3yeky4@mwanda Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..0028efa0abc3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9172,7 +9172,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { - struct pmu *pmu = NULL; + struct pmu *pmu; int idx; int ret; @@ -9456,9 +9456,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } pmu = perf_init_event(event); - if (!pmu) - goto err_ns; - else if (IS_ERR(pmu)) { + if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } -- cgit v1.2.3 From 36cc2b9222b5106de34085c4dd8635ac67ef5cba Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 22 May 2017 12:04:18 +0300 Subject: perf/core: Fix error handling in perf_event_alloc() We don't set an error code here which means that perf_event_alloc() returns ERR_PTR(0) (in other words NULL). The callers are not expecting that and would Oops. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/20170522090418.hvs6icgpdo53wkn5@mwanda Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0028efa0abc3..f8c27d3ef3a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9469,8 +9469,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, sizeof(unsigned long), GFP_KERNEL); - if (!event->addr_filters_offs) + if (!event->addr_filters_offs) { + err = -ENOMEM; goto err_per_task; + } /* force hw sync on the address filters */ event->addr_filters_gen = 1; -- cgit v1.2.3 From 3fc5b3b6a80b2e08a0fec0056208c5dff757e547 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 19 May 2017 15:53:31 +0800 Subject: smp: Avoid sending needless IPI in smp_call_function_many() Inter-Processor-Interrupt(IPI) is needed when a page is unmapped and the process' mm_cpumask() shows the process has ever run on other CPUs. page migration, page reclaim all need IPIs. The number of IPI needed to send to different CPUs is especially large for multi-threaded workload since mm_cpumask() is per process. For smp_call_function_many(), whenever a CPU queues a CSD to a target CPU, it will send an IPI to let the target CPU to handle the work. This isn't necessary - we need only send IPI when queueing a CSD to an empty call_single_queue. The reason: flush_smp_call_function_queue() that is called upon a CPU receiving an IPI will empty the queue and then handle all of the CSDs there. So if the target CPU's call_single_queue is not empty, we know that: i. An IPI for the target CPU has already been sent by 'previous queuers'; ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet. Thus, it's safe for us to just queue our CSD there without sending an addtional IPI. And for the 'previous queuers', we can limit it to the first queuer. To demonstrate the effect of this patch, a multi-thread workload that spawns 80 threads to equally consume 100G memory is used. This is tested on a 2 node broadwell-EP which has 44cores/88threads and 32G memory. So after 32G memory is used up, page reclaiming starts to happen a lot. With this patch, IPI number dropped 88% and throughput increased about 15% for the above workload. Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Cc: Dave Hansen Cc: Huang Ying Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20170519075331.GE2084@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..76d16fe3c427 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -30,6 +30,7 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); + return -ENOMEM; + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } @@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask, if (unlikely(!cpumask_weight(cfd->cpumask))) return; + cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; - llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { for_each_cpu(cpu, cfd->cpumask) { -- cgit v1.2.3 From 6c8557bdb28df3ae97476c5e2aed6373cd235aab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2017 12:58:25 +0200 Subject: smp, cpumask: Use non-atomic cpumask_{set,clear}_cpu() The cpumasks in smp_call_function_many() are private and not subject to concurrency, atomic bitops are pointless and expensive. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 76d16fe3c427..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -436,7 +436,7 @@ void smp_call_function_many(const struct cpumask *mask, cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, cfd->cpumask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ if (unlikely(!cpumask_weight(cfd->cpumask))) @@ -452,7 +452,7 @@ void smp_call_function_many(const struct cpumask *mask, csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - cpumask_set_cpu(cpu, cfd->cpumask_ipi); + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ -- cgit v1.2.3 From 73215849dfbf63421c1cafea5a1b6da9bb17831e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 09:39:44 +0900 Subject: sched/core: Use the new llist_for_each_entry_safe() primitive Now that we've added llist_for_each_entry_safe(), use it to simplify an open coded version of it in sched_ttwu_pending(). Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494549584-11730-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dde5d1e860f0..4a31239956da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } -- cgit v1.2.3 From de16b91effdbf5aeff8346b99bcd0991a5362db9 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 10:05:43 +0900 Subject: sched/rt: Remove unnecessary condition in push_rt_task() pick_next_pushable_task(rq) has BUG_ON(rq_cpu != task_cpu(task)) when it returns a task other than NULL, which means that task_cpu(task) must be rq->cpu. So if task == next_task, then task_cpu(next_task) must be rq->cpu as well. Remove the redundant condition and make the code simpler. This way one unnecessary branch and two LOAD operations can be avoided. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Juri Lelli Reviewed-by: Daniel Bristot de Oliveira Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494551143-22219-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..c18b50094fab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1819,7 +1819,7 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue -- cgit v1.2.3 From a776b968e52895a350d636e6e7fdcb3b10846fa4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 10:05:59 +0900 Subject: sched/deadline: Remove unnecessary condition in push_dl_task() pick_next_pushable_dl_task(rq) has BUG_ON(rq->cpu != task_cpu(task)) when it returns a task other than NULL, which means that task_cpu(task) must be rq->cpu. So if task == next_task, then task_cpu(next_task) must be rq->cpu as well. Remove the redundant condition and make the code simpler. This way one unnecessary branch and two LOAD operations can be avoided. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Juri Lelli Reviewed-by: Daniel Bristot de Oliveira Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494551159-22367-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..df6c2912bd60 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1533,7 +1533,7 @@ retry: * then possible that next_task has migrated. */ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. -- cgit v1.2.3 From 896bbb2522587e3b8eb2a0d204d43ccc1042a00d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Mar 2017 10:18:42 -0500 Subject: sched/core: Allow __sched_setscheduler() in interrupts when PI is not used When priority inheritance was added back in 2.6.18 to sched_setscheduler(), it added a path to taking an rt-mutex wait_lock, which is not IRQ safe. As PI is not a common occurrence, lockdep will likely never trigger if sched_setscheduler was called from interrupt context. A BUG_ON() was added to trigger if __sched_setscheduler() was ever called from interrupt context because there was a possibility to take the wait_lock. Today the wait_lock is irq safe, but the path to taking it in sched_setscheduler() is the same as the path to taking it from normal context. The wait_lock is taken with raw_spin_lock_irq() and released with raw_spin_unlock_irq() which will indiscriminately enable interrupts, which would be bad in interrupt context. The problem is that normalize_rt_tasks, which is called by triggering the sysrq nice-all-RT-tasks was changed to call __sched_setscheduler(), and this is done from interrupt context! Now __sched_setscheduler() takes a "pi" parameter that is used to know if the priority inheritance should be called or not. As the BUG_ON() only cares about calling the PI code, it should only bug if called from interrupt context with the "pi" parameter set to true. Reported-by: Laurent Dufour Tested-by: Laurent Dufour Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dbc7f069b93a ("sched: Use replace normalize_task() with __sched_setscheduler()") Link: http://lkml.kernel.org/r/20170308124654.10e598f2@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a31239956da..877241e9f2b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4188,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { -- cgit v1.2.3 From c249f255aab86b9b187ba319b9d2684841ac7c8d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 15 May 2017 14:14:13 -0500 Subject: sched/rt: Minimize rq->lock contention in do_sched_rt_period_timer() With CONFIG_RT_GROUP_SCHED=y, do_sched_rt_period_timer() sequentially takes each CPU's rq->lock. On a large, busy system, the cumulative time it takes to acquire each lock can be excessive, even triggering a watchdog timeout. If rt_rq->rt_time and rt_rq->rt_nr_running are both zero, this function does nothing while holding the lock, so don't bother taking it at all. Signed-off-by: Dave Kleikamp Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a767637b-df85-912f-ba69-c90ee00a3fb6@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c18b50094fab..581d5c7a5264 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { -- cgit v1.2.3 From 8655d5497735b288f8a9b458bd22e7d1bf95bb61 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 May 2017 15:13:16 +0200 Subject: sched/numa: Use down_read_trylock() for the mmap_sem A customer has reported a soft-lockup when running an intensive memory stress test, where the trace on multiple CPU's looks like this: RIP: 0010:[] [] native_queued_spin_lock_slowpath+0x10e/0x190 ... Call Trace: [] queued_spin_lock_slowpath+0x7/0xa [] change_protection_range+0x3b1/0x930 [] change_prot_numa+0x18/0x30 [] task_numa_work+0x1fe/0x310 [] task_work_run+0x72/0x90 Further investigation showed that the lock contention here is pmd_lock(). The task_numa_work() function makes sure that only one thread is let to perform the work in a single scan period (via cmpxchg), but if there's a thread with mmap_sem locked for writing for several periods, multiple threads in task_numa_work() can build up a convoy waiting for mmap_sem for read and then all get unblocked at once. This patch changes the down_read() to the trylock version, which prevents the build up. For a workload experiencing mmap_sem contention, it's probably better to postpone the NUMA balancing work anyway. This seems to have fixed the soft lockups involving pmd_lock(), which is in line with the convoy theory. Signed-off-by: Vlastimil Babka Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170515131316.21909-1-vbabka@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 219fe58e3023..47a0c552c77b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2470,7 +2470,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); -- cgit v1.2.3 From b4def42724594cd399cfee365221f5b38639711d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:43 +0200 Subject: async: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in async_run_entry_fn() and async_synchronize_cookie_domain() to handle the extra states. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170516184735.865155020@linutronix.de Signed-off-by: Ingo Molnar --- kernel/async.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); -- cgit v1.2.3 From 0594729c24d846889408a07057b5cc9e8d931419 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:44 +0200 Subject: extable: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in core_kernel_text() to handle the extra states, i.e. to cover init text up to the point where the system switches to state RUNNING. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170516184735.949992741@linutronix.de Signed-off-by: Ingo Molnar --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; -- cgit v1.2.3 From ff48cd26fc4889b9deb5f9333d3c61746e450b7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:45 +0200 Subject: printk: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in boot_delay_msec() to handle the extra states. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170516184736.027534895@linutronix.de Signed-off-by: Ingo Molnar --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1aecf44ab07..32fac391ac2a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } -- cgit v1.2.3 From 1c3c5eab171590f86edd8d31389d61dd1efe3037 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:48 +0200 Subject: sched/core: Enable might_sleep() and smp_processor_id() checks early might_sleep() and smp_processor_id() checks are enabled after the boot process is done. That hides bugs in the SMP bringup and driver initialization code. Enable it right when the scheduler starts working, i.e. when init task and kthreadd have been created and right before the idle task enables preemption. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170516184736.272225698@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 877241e9f2b0..c3e50cada84d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6238,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; -- cgit v1.2.3 From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 4b3e4ed6b0d958d7fb2f160bb8ebfb4f0db19382 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 20 Apr 2017 13:07:30 -0400 Subject: audit: unswing cap_* fields in PATH records The cap_* fields swing in and out of PATH records. If no capabilities are set, the cap_* fields are completely missing and when one of the cap_fi or cap_fp values is empty, that field is omitted. Original: type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fe=1 cap_fver=2 Normalize the PATH record by always printing all 4 cap_* fields. Fixed: type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL cap_fp=none cap_fi=none cap_fe=0 cap_fver=0 type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fi=none cap_fe=1 cap_fver=2 See: https://github.com/linux-audit/audit-kernel/issues/42 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a7c6a50477aa..b2e877100242 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1999,22 +1999,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); } static inline int audit_copy_fcaps(struct audit_names *name, -- cgit v1.2.3 From 490194269665d6d4915a4a5774f002885c5a2d8f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:26 -0700 Subject: module: Pass struct load_info into symbol checks Since we're already using values from struct load_info, just pass this pointer in directly and use what's needed as we need it. This allows us to access future fields in struct load_info too. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..ca4509b13400 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1278,12 +1278,13 @@ static u32 resolve_rel_crc(const s32 *crc) return *(u32 *)((void *)crc + *crc); } -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) { + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; @@ -1326,8 +1327,7 @@ bad_version: return 0; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { const s32 *crc; @@ -1343,8 +1343,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, BUG(); } preempt_enable(); - return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc); + return check_version(info, VMLINUX_SYMBOL_STR(module_layout), + mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1358,8 +1358,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } #else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) @@ -1367,8 +1366,7 @@ static inline int check_version(Elf_Shdr *sechdrs, return 1; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { return 1; @@ -1404,7 +1402,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -2971,7 +2969,7 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + if (!check_modstruct_version(info, mod)) return ERR_PTR(-ENOEXEC); return mod; -- cgit v1.2.3 From 3e2e857f9c3a19d55ee0ba7b428b8be5008960bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:27 -0700 Subject: module: Add module name to modinfo Accessing the mod structure (e.g. for mod->name) prior to having completed check_modstruct_version() can result in writing garbage to the error logs if the layout of the mod structure loaded from disk doesn't match the running kernel's mod structure layout. This kind of mismatch will become much more likely if a kernel is built with different randomization seed for the struct layout randomization plugin. Instead, add and use a new modinfo string for logging the module name. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ca4509b13400..3803449ca219 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,6 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { + char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -1318,12 +1319,12 @@ static int check_version(const struct load_info *info, } /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", - mod->name, symname); + info->name, symname); return 0; } @@ -2913,9 +2914,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) info->index.vers = 0; /* Pretend no __versions section! */ else info->index.vers = find_sec(info, "__versions"); + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; } @@ -2955,14 +2962,22 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo name field)"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; + /* + * If we didn't load the .modinfo 'name' field, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = mod->name; + if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", info->name); return ERR_PTR(-ENOEXEC); } @@ -2990,7 +3005,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { pr_err("%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); + info->name, modmagic, vermagic); return -ENOEXEC; } @@ -3270,7 +3285,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; - if (blacklisted(mod->name)) + if (blacklisted(info->name)) return ERR_PTR(-EPERM); err = check_modinfo(mod, info, flags); -- cgit v1.2.3 From 43fe8b8eb81eee713400340716cf945f59d21496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2017 23:27:38 +0200 Subject: posix-timers: Make signal printks conditional A recent commit added extra printks for CPU/RT limits. This can result in excessive spam in dmesg. Make the printks conditional on print_fatal_signals. Fixes: e7ea7c9806a2 ("rlimits: Print more information when CPU/RT limits are exceeded") Reported-by: Dave Jones Signed-off-by: Thomas Gleixner Cc: Arun Raghavan --- kernel/time/posix-cpu-timers.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..d2a1e6dd0291 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -825,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -838,8 +840,10 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - pr_info("RT Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } @@ -936,8 +940,10 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ - pr_info("RT Watchdog Timeout (hard): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -945,8 +951,10 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. */ - pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", - tsk->comm, task_pid_nr(tsk)); + if (print_fatal_signals) { + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + } __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; -- cgit v1.2.3 From 45aea321678856687927c53972321ebfab77759a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 May 2017 08:52:02 +0200 Subject: sched/clock: Fix early boot preempt assumption in __set_sched_clock_stable() The more strict early boot preemption warnings found that __set_sched_clock_stable() was incorrectly assuming we'd still be running on a single CPU: BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 caller is debug_smp_processor_id+0x1c/0x1e CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2-00108-g1c3c5ea #1 Call Trace: dump_stack+0x110/0x192 check_preemption_disabled+0x10c/0x128 ? set_debug_rodata+0x25/0x25 debug_smp_processor_id+0x1c/0x1e sched_clock_init_late+0x27/0x87 [...] Fix it by disabling IRQs. Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Steven Rostedt Cc: lkp@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170524065202.v25vyu7pvba5mhpd@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1a0d389d2f2b..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -133,12 +133,19 @@ static void __scd_stamp(struct sched_clock_data *scd) static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; + /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, -- cgit v1.2.3 From 41c25707d21716826e3c1f60967f5550610ec1c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 May 2017 12:03:48 -0400 Subject: cpuset: consider dying css as offline In most cases, a cgroup controller don't care about the liftimes of cgroups. For the controller, a css becomes online when ->css_online() is called on it and offline when ->css_offline() is called. However, cpuset is special in that the user interface it exposes cares whether certain cgroups exist or not. Combined with the RCU delay between cgroup removal and css offlining, this can lead to user visible behavior oddities where operations which should succeed after cgroup removals fail for some time period. The effects of cgroup removals are delayed when seen from userland. This patch adds css_is_dying() which tests whether offline is pending and updates is_cpuset_online() so that the function returns false also while offline is pending. This gets rid of the userland visible delays. Signed-off-by: Tejun Heo Reported-by: Daniel Jordan Link: http://lkml.kernel.org/r/327ca1f5-7957-fbb9-9e5f-9ba149d40ba2@oracle.com Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f6501f4f6040..ae643412948a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -176,9 +176,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) -- cgit v1.2.3 From f36776fafbaa0094390dd4e7e3e29805e0b82730 Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Tue, 9 May 2017 15:22:30 +0200 Subject: kobject: support passing in variables for synthetic uevents This patch makes it possible to pass additional arguments in addition to uevent action name when writing /sys/.../uevent attribute. These additional arguments are then inserted into generated synthetic uevent as additional environment variables. Before, we were not able to pass any additional uevent environment variables for synthetic uevents. This made it hard to identify such uevents properly in userspace to make proper distinction between genuine uevents originating from kernel and synthetic uevents triggered from userspace. Also, it was not possible to pass any additional information which would make it possible to optimize and change the way the synthetic uevents are processed back in userspace based on the originating environment of the triggering action in userspace. With the extra additional variables, we are able to pass through this extra information needed and also it makes it possible to synchronize with such synthetic uevents as they can be clearly identified back in userspace. The format for writing the uevent attribute is following: ACTION [UUID [KEY=VALUE ...] There's no change in how "ACTION" is recognized - it stays the same ("add", "change", "remove"). The "ACTION" is the only argument required to generate synthetic uevent, the rest of arguments, that this patch adds support for, are optional. The "UUID" is considered as transaction identifier so it's possible to use the same UUID value for one or more synthetic uevents in which case we logically group these uevents together for any userspace listeners. The "UUID" is expected to be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" format where "x" is a hex digit. The value appears in uevent as "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment variable. The "KEY=VALUE" pairs can contain alphanumeric characters only. It's possible to define zero or more more pairs - each pair is then delimited by a space character " ". Each pair appears in synthetic uevents as "SYNTH_ARG_KEY=VALUE" environment variable. That means the KEY name gains "SYNTH_ARG_" prefix to avoid possible collisions with existing variables. To pass the "KEY=VALUE" pairs, it's also required to pass in the "UUID" part for the synthetic uevent first. If "UUID" is not passed in, the generated synthetic uevent gains "SYNTH_UUID=0" environment variable automatically so it's possible to identify this situation in userspace when reading generated uevent and so we can still make a difference between genuine and synthetic uevents. Signed-off-by: Peter Rajnoha Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..d7eb41d772c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1202,10 +1202,7 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buffer, count, &action) == 0) - kobject_uevent(&mk->kobj, action); + kobject_synth_uevent(&mk->kobj, buffer, count); return count; } -- cgit v1.2.3 From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:05 +0200 Subject: bpf: fix incorrect pruning decision when alignment must be tracked Currently, when we enforce alignment tracking on direct packet access, the verifier lets the following program pass despite doing a packet write with unaligned access: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (bf) r0 = r2 4: (07) r0 += 14 5: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 6: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 7: (63) *(u32 *)(r0 -4) = r0 8: (b7) r0 = 0 9: (95) exit from 6 to 8: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp 8: (b7) r0 = 0 9: (95) exit from 5 to 10: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R10=fp 10: (07) r0 += 1 11: (05) goto pc-6 6: safe <----- here, wrongly found safe processed 15 insns However, if we enforce a pruning mismatch by adding state into r8 which is then being mismatched in states_equal(), we find that for the otherwise same program, the verifier detects a misaligned packet access when actually walking that path: 0: (61) r2 = *(u32 *)(r1 +76) 1: (61) r3 = *(u32 *)(r1 +80) 2: (61) r7 = *(u32 *)(r1 +8) 3: (b7) r8 = 1 4: (bf) r0 = r2 5: (07) r0 += 14 6: (25) if r7 > 0x1 goto pc+4 R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 9: (b7) r0 = 0 10: (95) exit from 7 to 9: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=0,max_value=1 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 9: (b7) r0 = 0 10: (95) exit from 6 to 11: R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0) R3=pkt_end R7=inv,min_value=2 R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp 11: (07) r0 += 1 12: (b7) r8 = 0 13: (05) goto pc-7 <----- mismatch due to r8 7: (2d) if r0 > r3 goto pc+1 R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15) R3=pkt_end R7=inv,min_value=2 R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp 8: (63) *(u32 *)(r0 -4) = r0 misaligned packet access off 2+15+-4 size 4 The reason why we fail to see it in states_equal() is that the third test in compare_ptrs_to_packet() ... if (old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; ... will let the above pass. The situation we run into is that old->off <= cur->off (14 <= 15), meaning that prior walked paths went with smaller offset, which was later used in the packet access after successful packet range check and found to be safe already. For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14 as in above program to it, results in R0=pkt(id=0,off=14,r=0) before the packet range test. Now, testing this against R3=pkt_end with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14) for the case when we're within bounds. A write into the packet at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and aligned (2 is for NET_IP_ALIGN). After processing this with all fall-through paths, we later on check paths from branches. When the above skb->mark test is true, then we jump near the end of the program, perform r0 += 1, and jump back to the 'if r0 > r3 goto out' test we've visited earlier already. This time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune that part because this time we'll have a larger safe packet range, and we already found that with off=14 all further insn were already safe, so it's safe as well with a larger off. However, the problem is that the subsequent write into the packet with 2 + 15 -4 is then unaligned, and not caught by the alignment tracking. Note that min_align, aux_off, and aux_off_align were all 0 in this example. Since we cannot tell at this time what kind of packet access was performed in the prior walk and what minimal requirements it has (we might do so in the future, but that requires more complexity), fix it to disable this pruning case for strict alignment for now, and let the verifier do check such paths instead. With that applied, the test cases pass and reject the program due to misalignment. Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.") Reference: http://patchwork.ozlabs.org/patch/761909/ Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72cd41f5b8b..e37e06b1229d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, { bool strict = env->strict_alignment; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - strict = true; - switch (reg->type) { case PTR_TO_PACKET: return check_pkt_ptr_alignment(reg, off, size, strict); @@ -2696,7 +2693,8 @@ err_free: /* the following conditions reduce the number of explored insns * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet */ -static bool compare_ptrs_to_packet(struct bpf_reg_state *old, +static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, + struct bpf_reg_state *old, struct bpf_reg_state *cur) { if (old->id != cur->id) @@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, * 'if (R4 > data_end)' and all further insn were already good with r=20, * so they will be good with r=30 and we can prune the search. */ - if (old->off <= cur->off && + if (!env->strict_alignment && old->off <= cur->off && old->off >= old->range && cur->off >= cur->range) return true; @@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(rold, rcur)) + compare_ptrs_to_packet(env, rold, rcur)) continue; return false; @@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } - if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + + env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - else - env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + env->strict_alignment = true; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), -- cgit v1.2.3 From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:06 +0200 Subject: bpf: properly reset caller saved regs after helper call and ld_abs/ind Currently, after performing helper calls, we clear all caller saved registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto specification. The way we reset these regs can affect pruning decisions in later paths, since we only reset register's imm to 0 and type to NOT_INIT. However, we leave out clearing of other variables such as id, min_value, max_value, etc, which can later on lead to pruning mismatches due to stale data. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e37e06b1229d..339c8a1371de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + + memset(®s[regno], 0, sizeof(regs[regno])); + regs[regno].type = NOT_INIT; + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; + regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +} + static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) { - regs[i].type = NOT_INIT; - regs[i].imm = 0; - regs[i].min_value = BPF_REGISTER_MIN_RANGE; - regs[i].max_value = BPF_REGISTER_MAX_RANGE; - regs[i].min_align = 0; - regs[i].aux_off = 0; - regs[i].aux_off_align = 0; - } + for (i = 0; i < MAX_BPF_REG; i++) + mark_reg_not_init(regs, i); /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; @@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs = state->regs; - struct bpf_reg_state *reg; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* update return register */ if (fn->ret_type == RET_INTEGER) { @@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; u8 mode = BPF_MODE(insn->code); - struct bpf_reg_state *reg; int i, err; if (!may_access_skb(env->prog->type)) { @@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) { - reg = regs + caller_saved[i]; - reg->type = NOT_INIT; - reg->imm = 0; - } + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(regs, caller_saved[i]); /* mark destination R0 register as readable, since it contains * the value fetched from the packet -- cgit v1.2.3 From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:08 +0200 Subject: bpf: fix wrong exposure of map_flags into fdinfo for lpm trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via attr->map_flags, since it does not support preallocation yet. We check the flag, but we never copy the flag into trie->map.map_flags, which is later on exposed into fdinfo and used by loaders such as iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test whether a pinned map has the same spec as the one from the BPF obj file and if not, bails out, which is currently the case for lpm since it exposes always 0 as flags. Also copy over flags in array_map_alloc() and stack_map_alloc(). They always have to be 0 right now, but we should make sure to not miss to copy them over at a later point in time when we add actual flags for them to use. Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation") Reported-by: Jarno Rajahalme Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 1 + kernel/bpf/lpm_trie.c | 1 + kernel/bpf/stackmap.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..172dc8ee0e3b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; + array->map.map_flags = attr->map_flags; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; + trie->map.map_flags = attr->map_flags; trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.key_size = attr->key_size; smap->map.value_size = value_size; smap->map.max_entries = attr->max_entries; + smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; -- cgit v1.2.3 From 9ab6055f959032258c0f83a070cd0d26ed7a8fc5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 24 May 2017 17:55:10 -0600 Subject: kernel/locking: Fix compile error with qrwlock.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saw these compile errors on SPARC when queued rwlock feature is enabled. CC kernel/locking/qrwlock.o kernel/locking/qrwlock.c: In function ‘queued_read_lock_slowpath’: kernel/locking/qrwlock.c:89: error: implicit declaration of function ‘arch_spin_lock’ kernel/locking/qrwlock.c:102: error: implicit declaration of function ‘arch_spin_unlock’ make[4]: *** [kernel/locking/qrwlock.o] Error 1 Include spinlock.h in qrwlock.c to fix it. Signed-off-by: Babu Moger Reviewed-by: Håkon Bugge Reviewed-by: Jane Chu Reviewed-by: Shannon Nelson Reviewed-by: Vijay Kumar Signed-off-by: David S. Miller --- kernel/locking/qrwlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* -- cgit v1.2.3 From 8f553c498e1772cccb39a114da4a498d22992758 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:12 +0200 Subject: cpu/hotplug: Provide cpus_read|write_[un]lock() The counting 'rwsem' hackery of get|put_online_cpus() is going to be replaced by percpu rwsem. Rename the functions to make it clear that it's locking and not some refcount style interface. These new functions will be used for the preparatory patches which make the code ready for the percpu rwsem conversion. Rename all instances in the cpu hotplug code while at it. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.080397752@linutronix.de --- kernel/cpu.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..d3221ae5b474 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,7 +235,7 @@ static struct { #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -void get_online_cpus(void) +void cpus_read_lock(void) { might_sleep(); if (cpu_hotplug.active_writer == current) @@ -245,9 +245,9 @@ void get_online_cpus(void) atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(get_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_lock); -void put_online_cpus(void) +void cpus_read_unlock(void) { int refcount; @@ -264,7 +264,7 @@ void put_online_cpus(void) cpuhp_lock_release(); } -EXPORT_SYMBOL_GPL(put_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_unlock); /* * This ensures that the hotplug operation can begin only when the @@ -288,7 +288,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); * get_online_cpus() not an api which is called all that often. * */ -void cpu_hotplug_begin(void) +void cpus_write_lock(void) { DEFINE_WAIT(wait); @@ -306,7 +306,7 @@ void cpu_hotplug_begin(void) finish_wait(&cpu_hotplug.wq, &wait); } -void cpu_hotplug_done(void) +void cpus_write_unlock(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); @@ -773,7 +773,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; - cpu_hotplug_begin(); + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -811,7 +811,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, } out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -893,7 +893,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) struct task_struct *idle; int ret = 0; - cpu_hotplug_begin(); + cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; @@ -941,7 +941,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -1424,7 +1424,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (sp->multi_instance == false) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,7 +1453,7 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); @@ -1486,7 +1486,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpuhp_cb_check(state) || !name) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1522,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1544,7 +1544,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, if (!sp->multi_instance) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) @@ -1565,7 +1565,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1587,7 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) BUG_ON(cpuhp_cb_check(state)); - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,7 +1615,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.2.3 From 71def423fe3da0d40ad3427a4cd5f9edc53bff67 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:14 +0200 Subject: cpu/hotplug: Provide cpuhp_setup/remove_state[_nocalls]_cpuslocked() Some call sites of cpuhp_setup/remove_state[_nocalls]() are within a cpus_read locked region. cpuhp_setup/remove_state[_nocalls]() call cpus_read_lock() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide locked versions of the interfaces to avoid nested calls to cpus_read_lock(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.239600868@linutronix.de --- kernel/cpu.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d3221ae5b474..dc27c5a28153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1459,7 +1459,7 @@ unlock: EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** - * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state @@ -1468,25 +1468,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @multi_instance: State is set up for multiple instances which get * added afterwards. * + * The caller needs to hold cpus read locked while calling this function. * Returns: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN * 0 for all other states * On failure: proper (negative) error code */ -int __cpuhp_setup_state(enum cpuhp_state state, - const char *name, bool invoke, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; bool dynstate; + lockdep_assert_cpus_held(); + if (cpuhp_cb_check(state) || !name) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1524,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1531,6 +1532,22 @@ out: return state; return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); + +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, + teardown, multi_instance); + cpus_read_unlock(); + return ret; +} EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, @@ -1572,22 +1589,23 @@ remove: EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** - * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * + * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ -void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); - cpus_read_lock(); + lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,6 +1633,13 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); +} +EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + cpus_read_lock(); + __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.2.3 From 9805c6733349ea3ccd22cf75b8ebaabb5290e310 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:15 +0200 Subject: cpu/hotplug: Add __cpuhp_state_add_instance_cpuslocked() Add cpuslocked() variants for the multi instance registration so this can be called from a cpus_read_lock() protected region. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.321782217@linutronix.de --- kernel/cpu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index dc27c5a28153..e4389ac55b65 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1413,18 +1413,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, - bool invoke) +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, + bool invoke) { struct cpuhp_step *sp; int cpu; int ret; + lockdep_assert_cpus_held(); + sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,6 +1455,16 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); + return ret; +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } -- cgit v1.2.3 From fe5595c074005bd94f0c7d1644175941149f6768 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:16 +0200 Subject: stop_machine: Provide stop_machine_cpuslocked() Some call sites of stop_machine() are within a get_online_cpus() protected region. stop_machine() calls get_online_cpus() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide stop_machine_cpuslocked() to avoid nested calls to get_online_cpus(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.400700852@linutronix.de --- kernel/stop_machine.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp .active_cpus = cpus, }; + lockdep_assert_cpus_held(); + if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) int ret; /* No CPUs can come up or down during this. */ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); + cpus_read_lock(); + ret = stop_machine_cpuslocked(fn, data, cpus); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3 From 9596695ee1e7eedd743c43811fe68299eb005b5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:17 +0200 Subject: padata: Make padata_alloc() static No users outside of padata.c Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.491457256@linutronix.de --- kernel/padata.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..0c708f648853 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -933,19 +933,6 @@ static struct kobj_type padata_attr_type = { .release = padata_sysfs_release, }; -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - /** * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. @@ -954,9 +941,9 @@ EXPORT_SYMBOL(padata_alloc_possible); * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; struct parallel_data *pd = NULL; @@ -1010,6 +997,19 @@ err: return NULL; } +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + /** * padata_free - free a padata instance * -- cgit v1.2.3 From c5a81c8ff816d89941fe86961b286765d6ca2f5f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:18 +0200 Subject: padata: Avoid nested calls to cpus_read_lock() in pcrypt_init_padata() pcrypt_init_padata() cpus_read_lock() padata_alloc_possible() padata_alloc() cpus_read_lock() The nested call to cpus_read_lock() works with the current implementation, but prevents the conversion to a percpu rwsem. The other caller of padata_alloc_possible() is pcrypt_init_padata() which calls from a cpus_read_lock() protected region as well. Remove the cpus_read_lock() call in padata_alloc() and document the calling convention. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.571278910@linutronix.de --- kernel/padata.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 0c708f648853..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -940,6 +940,8 @@ static struct kobj_type padata_attr_type = { * @wq: workqueue to use for the allocated padata instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Must be called from a cpus_read_lock() protected region */ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, const struct cpumask *pcpumask, @@ -952,7 +954,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - get_online_cpus(); if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { @@ -976,14 +977,12 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; - put_online_cpus(); - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif return pinst; @@ -992,7 +991,6 @@ err_free_masks: free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); - put_online_cpus(); err: return NULL; } @@ -1003,9 +1001,12 @@ err: * parallel workers. * * @wq: workqueue to use for the allocated padata instance + * + * Must be called from a cpus_read_lock() protected region */ struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) { + lockdep_assert_cpus_held(); return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); -- cgit v1.2.3 From 210e21331fc3a396af640cec652be769d146e49f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:28 +0200 Subject: cpu/hotplug: Use stop_machine_cpuslocked() in takedown_cpu() takedown_cpu() is a cpu hotplug function invoking stop_machine(). The cpu hotplug machinery holds the hotplug lock for write. stop_machine() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use stop_machine_cpuslocked() to avoid the nested call. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.423292433@linutronix.de --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e4389ac55b65..142d889d9f69 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -701,7 +701,7 @@ static int takedown_cpu(unsigned int cpu) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); + err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); -- cgit v1.2.3 From a63fbed776c7124ce9f606234267c3c095b2680e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:34 +0200 Subject: perf/tracing/cpuhotplug: Fix locking order perf, tracing, kprobes and jump_labels have a gazillion of ways to create dependency lock chains. Some of those involve nested invocations of get_online_cpus(). The conversion of the hotplug locking to a percpu rwsem requires to avoid such nested calls. sys_perf_event_open() protects most of the syscall logic against cpu hotplug. This causes nested calls and lock inversions versus ftrace and kprobes in various interesting ways. It's impossible to move the hotplug locking to the outer end of all call chains in the involved facilities, so the hotplug protection in sys_perf_event_open() needs to be solved differently. Introduce 'pmus_mutex' which protects a perf private online cpumask. This mutex is taken when the mask is updated in the cpu hotplug callbacks and can be taken in sys_perf_event_open() to protect the swhash setup/teardown code and when the final judgement about a valid event has to be made. [ tglx: Produced changelog and fixed the swhash interaction ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20170524081548.930941109@linutronix.de --- kernel/events/core.c | 106 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..b97cda4d1777 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static cpumask_var_t perf_online_mask; /* * perf event paranoia level: @@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && + cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void) { int err, cpu, failed_cpu; - get_online_cpus(); + mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { @@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void) goto fail; } } - put_online_cpus(); - + mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { @@ -7741,8 +7734,7 @@ fail: break; swevent_hlist_put_cpu(cpu); } - - put_online_cpus(); + mutex_unlock(&pmus_lock); return err; } @@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev, pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); @@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpu_function_call(cpu, (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; @@ -9059,6 +9051,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); } @@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - get_online_cpus(); - if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cpus; + goto err_cred; /* * Reuse ptrace permission checks for now. @@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open, goto err_locked; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } + } + + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. @@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open, put_task_struct(task); } - put_online_cpus(); - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -10197,8 +10203,6 @@ err_alloc: err_cred: if (task) mutex_unlock(&task->signal->cred_guard_mutex); -err_cpus: - put_online_cpus(); err_task: if (task) put_task_struct(task); @@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + if (!cpuctx->online) { + err = -ENODEV; + goto err_unlock; + } + } + if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; @@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void) struct swevent_htable *swhash; int cpu; + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); @@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void) } } -int perf_event_init_cpu(unsigned int cpu) +void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); - return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info) static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; - int idx; - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + mutex_lock(&pmus_lock); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; mutex_unlock(&ctx->mutex); } - srcu_read_unlock(&pmus_srcu, idx); + cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); } #else @@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +int perf_event_init_cpu(unsigned int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); + cpumask_set_cpu(cpu, perf_online_mask); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); + } + mutex_unlock(&pmus_lock); + + return 0; +} + int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); -- cgit v1.2.3 From f2545b2d4ce13e068897ef60ae64dffe215f4152 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:35 +0200 Subject: jump_label: Reorder hotplug lock and jump_label_lock The conversion of the hotplug locking to a percpu rwsem unearthed lock ordering issues all over the place. The jump_label code has two issues: 1) Nested get_online_cpus() invocations 2) Ordering problems vs. the cpus rwsem and the jump_label_mutex To cure these, the following lock order has been established; cpus_rwsem -> jump_label_lock -> text_mutex Even if not all architectures need protection against CPU hotplug, taking cpus_rwsem before jump_label_lock is now mandatory in code pathes which actually modify code and therefor need text_mutex protection. Move the get_online_cpus() invocations into the core jump label code and establish the proper lock order where required. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: "David S. Miller" Cc: Paul E. McKenney Cc: Chris Metcalf Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Jason Baron Cc: Ralf Baechle Link: http://lkml.kernel.org/r/20170524081549.025830817@linutronix.de --- kernel/jump_label.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) return; } + cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + cpus_read_lock(); /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); + cpus_read_unlock(); return; } @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); + cpus_read_unlock(); } static void jump_label_update_timeout(struct work_struct *work) @@ -334,6 +340,7 @@ void __init jump_label_init(void) if (static_key_initialized) return; + cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -353,6 +360,7 @@ void __init jump_label_init(void) } static_key_initialized = true; jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_MODULES @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, struct module *mod = data; int ret = 0; + cpus_read_lock(); + jump_label_lock(); + switch (val) { case MODULE_STATE_COMING: - jump_label_lock(); ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } - jump_label_unlock(); break; case MODULE_STATE_GOING: - jump_label_lock(); jump_label_del_module(mod); - jump_label_unlock(); break; case MODULE_STATE_LIVE: - jump_label_lock(); jump_label_invalidate_module_init(mod); - jump_label_unlock(); break; } + jump_label_unlock(); + cpus_read_unlock(); + return notifier_from_errno(ret); } -- cgit v1.2.3 From 2d1e38f56622b9bb5af85be63c1052c056f5c677 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:36 +0200 Subject: kprobes: Cure hotplug lock ordering issues Converting the cpu hotplug locking to a percpu rwsem unearthed hidden lock ordering problems. There is a wide range of locks involved in this: kprobe_mutex, jump_label_mutex, ftrace_lock, text_mutex, event_mutex, module_mutex, func_hash->regex_lock and a gazillion of lock order permutations with nested get_online_cpus() calls. Some of those permutations are potential deadlocks even with the current nesting hotplug locking scheme, but they can't be discovered by lockdep. The conversion of the hotplug locking to a percpu rwsem requires to prevent nested locking, so it's required to take the hotplug rwsem early in the call chain and establish a proper lock order. After quite some analysis and going down the wrong road severa times the following lock order has been chosen: kprobe_mutex -> cpus_rwsem -> jump_label_mutex -> text_mutex For kprobes which hook on an ftrace function trace point, it's required to drop cpus_rwsem before calling into the ftrace code to avoid a deadlock on the func_hash->regex_lock. [ Steven: Ftrace interaction fixes ] Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Masami Hiramatsu Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/20170524081549.104864779@linutronix.de --- kernel/kprobes.c | 59 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..9f6056749a28 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() + * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. */ - get_online_cpus(); + lockdep_assert_cpus_held(); + + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + /* See comment in do_optimize_kprobes() */ + lockdep_assert_cpus_held(); + /* Unoptimization must be done anytime */ if (list_empty(&unoptimizing_list)) return; - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) list_del_init(&op->list); } mutex_unlock(&text_mutex); - put_online_cpus(); } /* Reclaim all kprobes on the free_list */ @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); + cpus_read_lock(); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { - get_online_cpus(); + lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); - put_online_cpus(); if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called */ + cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_SYSCTL @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) if (kprobes_allow_optimization) goto out; + cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -833,6 +837,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } + cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) return; } + cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) arm_kprobe_ftrace(kp); return; } - /* - * Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). - */ + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* Disarm a kprobe with text_mutex */ @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) disarm_kprobe_ftrace(kp); return; } - /* Ditto */ + + cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; + cpus_read_lock(); + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); - /* - * Get online CPUs to avoid text_mutex deadlock.with stop machine, - * which is invoked by unoptimize_kprobe() in add_new_kprobe() - */ - get_online_cpus(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) out: mutex_unlock(&text_mutex); - put_online_cpus(); jump_label_unlock(); + cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) goto out; } - mutex_lock(&text_mutex); /* Avoiding text modification */ + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); + cpus_read_unlock(); if (ret) goto out; @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) /* Try to optimize kprobe */ try_to_optimize_kprobe(p); - out: mutex_unlock(&kprobe_mutex); -- cgit v1.2.3 From fc8dffd379ca5620664336eb895a426b42847558 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:40 +0200 Subject: cpu/hotplug: Convert hotplug locking to percpu rwsem There are no more (known) nested calls to get_online_cpus() and all observed lock ordering problems have been addressed. Replace the magic nested 'rwsem' hackery with a percpu-rwsem. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.447014063@linutronix.de --- kernel/cpu.c | 107 ++++++++--------------------------------------------------- 1 file changed, 13 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 142d889d9f69..66836216ebae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -196,121 +197,41 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. +/* + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -static struct { - struct task_struct *active_writer; - /* wait queue to wake up the active_writer */ - wait_queue_head_t wq; - /* verifies that no writer will get active while readers are active */ - struct mutex lock; - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - atomic_t refcount; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} cpu_hotplug = { - .active_writer = NULL, - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), -#endif -}; - -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire_tryread() \ - lock_map_acquire_tryread(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) -#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) - +DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); void cpus_read_lock(void) { - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); + percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); void cpus_read_unlock(void) { - int refcount; - - if (cpu_hotplug.active_writer == current) - return; - - refcount = atomic_dec_return(&cpu_hotplug.refcount); - if (WARN_ON(refcount < 0)) /* try to fix things up */ - atomic_inc(&cpu_hotplug.refcount); - - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) - wake_up(&cpu_hotplug.wq); - - cpuhp_lock_release(); - + percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. - * - */ void cpus_write_lock(void) { - DEFINE_WAIT(wait); - - cpu_hotplug.active_writer = current; - cpuhp_lock_acquire(); - - for (;;) { - mutex_lock(&cpu_hotplug.lock); - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); - if (likely(!atomic_read(&cpu_hotplug.refcount))) - break; - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } - finish_wait(&cpu_hotplug.wq, &wait); + percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); - cpuhp_lock_release(); + percpu_up_write(&cpu_hotplug_lock); +} + +void lockdep_assert_cpus_held(void) +{ + percpu_rwsem_assert_held(&cpu_hotplug_lock); } /* @@ -344,8 +265,6 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Notifier wrappers for transitioning to state machine */ - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); -- cgit v1.2.3 From 49dfe2a6779717d9c18395684ee31bdc98b22e53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:43 +0200 Subject: cpuhotplug: Link lock stacks for hotplug callbacks The CPU hotplug callbacks are not covered by lockdep versus the cpu hotplug rwsem. CPU0 CPU1 cpuhp_setup_state(STATE, startup, teardown); cpus_read_lock(); invoke_callback_on_ap(); kick_hotplug_thread(ap); wait_for_completion(); hotplug_thread_fn() lock(m); do_stuff(); unlock(m); Lockdep does not know about this dependency and will not trigger on the following code sequence: lock(m); cpus_read_lock(); Add a lockdep map and connect the initiators lock chain with the hotplug thread lock chain, so potential deadlocks can be detected. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.709375845@linutronix.de --- kernel/cpu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 66836216ebae..7435ffc6163b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -66,6 +66,12 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) +static struct lock_class_key cpuhp_state_key; +static struct lockdep_map cpuhp_state_lock_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +#endif + /** * cpuhp_step - Hotplug state machine step * @name: Name of the step @@ -403,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; + lock_map_acquire(&cpuhp_state_lock_map); /* Single callback invocation for [un]install ? */ if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { @@ -429,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu) else if (st->state > st->target) ret = cpuhp_ap_offline(cpu, st); } + lock_map_release(&cpuhp_state_lock_map); st->result = ret; complete(&st->done); } @@ -443,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); + /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. @@ -486,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state state = st->state; trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); -- cgit v1.2.3 From 5a29ef22098874db79af7bf92a247a0f503bfa6e Mon Sep 17 00:00:00 2001 From: Vincent Legoll Date: Tue, 9 May 2017 10:34:09 +0200 Subject: genirq: Make early_irq_init() print out more informative The printk in early_irq_init() is cryptic and badly formatted: NR_IRQS:33024 nr_irqs:968 16 The last number is the number of preallocated interrupts, so add a prefix to it: NR_IRQS: 33024, nr_irqs: 968, preallocated irqs: 16 Cleanup the formatting for better readability as well. Signed-off-by: Vincent Legoll Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1494318849-6733-1-git-send-email-vincent.legoll@gmail.com --- kernel/irq/irqdesc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..09abce2ea8f0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -480,7 +480,8 @@ int __init early_irq_init(void) /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", + NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) nr_irqs = IRQ_BITMAP_BITS; @@ -516,7 +517,7 @@ int __init early_irq_init(void) init_irq_default_affinity(); - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); desc = irq_desc; count = ARRAY_SIZE(irq_desc); -- cgit v1.2.3 From 5720acf4bfc142ba568d5b6782fceaf62ed15e0b Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 26 May 2017 14:45:21 +0200 Subject: livepatch: Make livepatch dependent on !TRIM_UNUSED_KSYMS If TRIM_UNUSED_KSYMS is enabled, all unneeded exported symbols are made unexported. Two-pass build of the kernel is done to find out which symbols are needed based on a configuration. This effectively complicates things for out-of-tree modules. Livepatch exports functions to (un)register and enable/disable a live patch. The only in-tree module which uses these functions is a sample in samples/livepatch/. If the sample is disabled, the functions are trimmed and out-of-tree live patches cannot be built. Note that live patches are intended to be built out-of-tree. Suggested-by: Michal Marek Acked-by: Josh Poimboeuf Acked-by: Jessica Yu Signed-off-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 045022557936..ec4565122e65 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -10,6 +10,7 @@ config LIVEPATCH depends on SYSFS depends on KALLSYMS_ALL depends on HAVE_LIVEPATCH + depends on !TRIM_UNUSED_KSYMS help Say Y here if you want to support kernel live patching. This option has no runtime impact until a kernel "patch" -- cgit v1.2.3 From f9797c2f20c0160edd718aa467101f3301e57e59 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Thu, 25 May 2017 16:20:38 +0100 Subject: ftrace: Fix memory leak in ftrace_graph_release() ftrace_hash is being kfree'ed in ftrace_graph_release(), however the ->buckets field is not. This results in a memory leak that is easily captured by kmemleak: unreferenced object 0xffff880038afe000 (size 8192): comm "trace-cmd", pid 238, jiffies 4294916898 (age 9.736s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x12d/0x1a0 [] alloc_ftrace_hash+0x51/0x80 [] __ftrace_graph_open.isra.39.constprop.46+0xa3/0x100 [] ftrace_graph_open+0x68/0xa0 [] do_dentry_open.isra.1+0x1bd/0x2d0 [] vfs_open+0x47/0x60 [] path_openat+0x2a5/0x1020 [] do_filp_open+0x8a/0xf0 [] do_sys_open+0x12f/0x200 [] SyS_open+0x1e/0x20 [] entry_SYSCALL_64_fastpath+0x13/0x94 [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/20170525152038.7661-1-lhenriques@suse.com Cc: stable@vger.kernel.org Fixes: b9b0c831bed2 ("ftrace: Convert graph filter to use hash tables") Signed-off-by: Luis Henriques Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 74fdfe9ed3db..9e5841dc14b5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5063,7 +5063,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) } out: - kfree(fgd->new_hash); + free_ftrace_hash(fgd->new_hash); kfree(fgd); return ret; -- cgit v1.2.3 From c93f5cf571e7795f97d49ef51b766cf25e328545 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 May 2017 19:38:17 +0900 Subject: kprobes/x86: Fix to set RWX bits correctly before releasing trampoline Fix kprobes to set(recover) RWX bits correctly on trampoline buffer before releasing it. Releasing readonly page to module_memfree() crash the kernel. Without this fix, if kprobes user register a bunch of kprobes in function body (since kprobes on function entry usually use ftrace) and unregister it, kernel hits a BUG and crash. Link: http://lkml.kernel.org/r/149570868652.3518.14120169373590420503.stgit@devbox Signed-off-by: Masami Hiramatsu Fixes: d0381c81c2f7 ("kprobes/x86: Set kprobes pages read-only") Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..adfe3b4cfe05 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -122,7 +122,7 @@ static void *alloc_insn_page(void) return module_alloc(PAGE_SIZE); } -static void free_insn_page(void *page) +void __weak free_insn_page(void *page) { module_memfree(page); } -- cgit v1.2.3 From d3ba5a9a345b1243276f8a982e1bce557c2504fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 May 2017 12:03:11 +0300 Subject: posix-timers: Make posix_clocks immutable There are no more modular users providing a posix clock. The register function is now pointless so the posix clock array can be initialized statically at compile time and the array including the various k_clock structs can be marked 'const'. Inspired by changes in the Grsecurity patch set, but done proper. [ tglx: Massaged changelog and fixed the POSIX_TIMER=n case ] Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Mike Travis Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/20170526090311.3377-3-hch@lst.de --- kernel/time/alarmtimer.c | 89 ++++++++++--------- kernel/time/posix-clock.c | 2 +- kernel/time/posix-cpu-timers.c | 34 +++----- kernel/time/posix-timers.c | 191 +++++++++++++++++++---------------------- 4 files changed, 146 insertions(+), 170 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..4f4cc3509b30 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -307,38 +307,6 @@ static int alarmtimer_resume(struct device *dev) } #endif -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) -{ - struct alarm_base *base; - unsigned long flags; - ktime_t delta; - - switch(type) { - case ALARM_REALTIME: - base = &alarm_bases[ALARM_REALTIME]; - type = ALARM_REALTIME_FREEZER; - break; - case ALARM_BOOTTIME: - base = &alarm_bases[ALARM_BOOTTIME]; - type = ALARM_BOOTTIME_FREEZER; - break; - default: - WARN_ONCE(1, "Invalid alarm type: %d\n", type); - return; - } - - delta = ktime_sub(absexp, base->gettime()); - - spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta || (delta < freezer_delta)) { - freezer_delta = delta; - freezer_expires = absexp; - freezer_alarmtype = type; - } - spin_unlock_irqrestore(&freezer_delta_lock, flags); -} - - /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -488,6 +456,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward_now); +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} /** * clock2alarm - helper that converts from clockid to alarmtypes @@ -846,6 +846,17 @@ out: return ret; } +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { @@ -871,23 +882,9 @@ static int __init alarmtimer_init(void) struct platform_device *pdev; int error = 0; int i; - struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .nsleep = alarm_timer_nsleep, - }; alarmtimer_rtc_timer_init(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); - } - /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 31d588d37a17..7e453005e078 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -434,7 +434,7 @@ static int pc_timer_settime(struct k_itimer *kit, int flags, return err; } -struct k_clock clock_posix_dynamic = { +const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..1a522b39f19d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1413,7 +1413,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } -struct k_clock clock_posix_cpu = { +const struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, .clock_set = posix_cpu_clock_set, .clock_get = posix_cpu_clock_get, @@ -1425,24 +1425,16 @@ struct k_clock clock_posix_cpu = { .timer_get = posix_cpu_timer_get, }; -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, +}; - return 0; -} -__initcall(init_posix_cpu_timers); +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4d7b2ce09c27..0c0cccfa3586 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -125,8 +125,6 @@ static DEFINE_SPINLOCK(hash_lock); * which we beg off on and pass to do_sys_settimeofday(). */ -static struct k_clock posix_clocks[MAX_CLOCKS]; - /* * These ones are defined below. */ @@ -280,74 +278,87 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - posix_timers_register_clock(CLOCK_TAI, &clock_tai); - posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); @@ -521,30 +532,6 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; @@ -581,15 +568,15 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static struct k_clock *clockid_to_kclock(const clockid_t id) +static const struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) return NULL; - return &posix_clocks[id]; + return posix_clocks[id]; } static int common_timer_create(struct k_itimer *new_timer) @@ -604,7 +591,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; @@ -781,7 +768,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; - struct k_clock *kc; + const struct k_clock *kc; unsigned long flags; int ret = 0; @@ -890,7 +877,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct itimerspec new_spec, old_spec; struct k_itimer *timr; unsigned long flag; - struct k_clock *kc; + const struct k_clock *kc; int error = 0; if (!new_setting) @@ -939,7 +926,7 @@ static int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = clockid_to_kclock(timer->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; @@ -1018,7 +1005,7 @@ void exit_itimers(struct signal_struct *sig) SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp64; struct timespec new_tp; @@ -1035,7 +1022,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; @@ -1055,7 +1042,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct timex __user *, utx) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timex ktx; int err; @@ -1078,7 +1065,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; @@ -1110,7 +1097,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t64; struct timespec t; @@ -1136,7 +1123,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) return -EINVAL; -- cgit v1.2.3 From b6b3b80fceb175c825ad6c72659e0a72e201fc5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 May 2017 12:23:47 +0200 Subject: alarmtimer: Fix posix-timer constification fallout Some freezer related variables are only used when either CONFIG_POSIX_TIMER or CONFIG_RTC_CLASS are enabled. Hide them when both are off. Fixes: d3ba5a9a345b ("posix-timers: Make posix_clocks immutable") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Christoph Helwig --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4f4cc3509b30..4cfebfff848d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -45,11 +45,13 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) /* freezer information to handle clock_nanosleep triggered wakeups */ static enum alarmtimer_type freezer_alarmtype; static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +#endif static struct wakeup_source *ws; -- cgit v1.2.3 From 613763a1f056211522bac77ff39f25706e678fdd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 May 2017 22:04:29 -0400 Subject: take compat_sys_old_getrlimit() to native syscall ... and sanitize the ifdefs in there Signed-off-by: Al Viro --- kernel/compat.c | 29 ----------------------------- kernel/sys.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..860f674fa556 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -468,35 +468,6 @@ COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, return do_prlimit(current, resource, &r, NULL); } -#ifdef COMPAT_RLIM_OLD_INFINITY - -COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); - set_fs(old_fs); - - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -#endif - COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..3778a8a417b6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1328,6 +1328,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + r = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (r.rlim_cur > 0x7FFFFFFF) + r.rlim_cur = 0x7FFFFFFF; + if (r.rlim_max > 0x7FFFFFFF) + r.rlim_max = 0x7FFFFFFF; + + if (put_user(r.rlim_cur, &rlim->rlim_cur) || + put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + return 0; +} +#endif + #endif static inline bool rlim64_is_infinity(__u64 rlim64) -- cgit v1.2.3 From bcfe8ad8ef55a1cf3c935c2667e8e5ae598b3b7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 27 May 2017 00:29:34 -0400 Subject: do_sigaltstack(): lift copying to/from userland into callers Signed-off-by: Al Viro --- kernel/signal.c | 107 ++++++++++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..d1eed0d7ca64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3113,78 +3113,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) } static int -do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) +do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) { - stack_t oss; - int error; + struct task_struct *t = current; - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp) | - (current->sas_ss_flags & SS_FLAG_BITS); + if (oss) { + memset(oss, 0, sizeof(stack_t)); + oss->ss_sp = (void __user *) t->sas_ss_sp; + oss->ss_size = t->sas_ss_size; + oss->ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); + } - if (uss) { - void __user *ss_sp; - size_t ss_size; - unsigned ss_flags; + if (ss) { + void __user *ss_sp = ss->ss_sp; + size_t ss_size = ss->ss_size; + unsigned ss_flags = ss->ss_flags; int ss_mode; - error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) - goto out; - error = __get_user(ss_sp, &uss->ss_sp) | - __get_user(ss_flags, &uss->ss_flags) | - __get_user(ss_size, &uss->ss_size); - if (error) - goto out; - - error = -EPERM; - if (on_sig_stack(sp)) - goto out; + if (unlikely(on_sig_stack(sp))) + return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; - error = -EINVAL; - if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && - ss_mode != 0) - goto out; + if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0)) + return -EINVAL; if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { - error = -ENOMEM; - if (ss_size < MINSIGSTKSZ) - goto out; + if (unlikely(ss_size < MINSIGSTKSZ)) + return -ENOMEM; } - current->sas_ss_sp = (unsigned long) ss_sp; - current->sas_ss_size = ss_size; - current->sas_ss_flags = ss_flags; - } - - error = 0; - if (uoss) { - error = -EFAULT; - if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) - goto out; - error = __put_user(oss.ss_sp, &uoss->ss_sp) | - __put_user(oss.ss_size, &uoss->ss_size) | - __put_user(oss.ss_flags, &uoss->ss_flags); + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; } - -out: - return error; + return 0; } + SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { - return do_sigaltstack(uss, uoss, current_user_stack_pointer()); + stack_t new, old; + int err; + if (uss && copy_from_user(&new, uss, sizeof(stack_t))) + return -EFAULT; + err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, + current_user_stack_pointer()); + if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) + err = -EFAULT; + return err; } int restore_altstack(const stack_t __user *uss) { - int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + stack_t new; + if (copy_from_user(&new, uss, sizeof(stack_t))) + return -EFAULT; + (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); /* squash all but EFAULT for now */ - return err == -EFAULT ? err : 0; + return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) @@ -3207,29 +3197,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, { stack_t uss, uoss; int ret; - mm_segment_t seg; if (uss_ptr) { compat_stack_t uss32; - - memset(&uss, 0, sizeof(stack_t)); if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), - (stack_t __force __user *) &uoss, + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer()); - set_fs(seg); if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || - __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || - __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || - __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + compat_stack_t old; + memset(&old, 0, sizeof(old)); + old.ss_sp = ptr_to_compat(uoss.ss_sp); + old.ss_flags = uoss.ss_flags; + old.ss_size = uoss.ss_size; + if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; -- cgit v1.2.3 From 7c25904508afef134d7a9d2ad1690ee6554a1faa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 May 2017 14:56:50 +0200 Subject: nohz: Reset next_tick cache even when the timer has no regs Handle tick interrupts whose regs are NULL, out of general paranoia. It happens when hrtimer_interrupt() is called from non-interrupt contexts, such as hotplug CPU down events. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 764d2905e6a5..e3043873fcdc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1202,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (regs) tick_sched_handle(ts, regs); + else + ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) -- cgit v1.2.3 From 7786f6b6dfc12d17eea2df04116de6ebac50c884 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 7 Apr 2017 10:17:27 -0400 Subject: audit: add ambient capabilities to CAPSET and BPRM_FCAPS records Capabilities were augmented to include ambient capabilities in v4.3 commit 58319057b784 ("capabilities: ambient capabilities"). Add ambient capabilities to the audit BPRM_FCAPS and CAPSET records. The record contains fields "old_pp", "old_pi", "old_pe", "new_pp", "new_pi", "new_pe" so in keeping with the previous record normalizations, change the "new_*" variants to simply drop the "new_" prefix. A sample of the replaced BPRM_FCAPS record: RAW: type=BPRM_FCAPS msg=audit(1491468034.252:237): fver=2 fp=0000000000200000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 old_pa=0000000000000000 pp=0000000000200000 pi=0000000000000000 pe=0000000000200000 pa=0000000000000000 INTERPRET: type=BPRM_FCAPS msg=audit(04/06/2017 04:40:34.252:237): fver=2 fp=sys_admin fi=none fe=chown old_pp=none old_pi=none old_pe=none old_pa=none pp=sys_admin pi=none pe=sys_admin pa=none A sample of the replaced CAPSET record: RAW: type=CAPSET msg=audit(1491469502.371:242): pid=833 cap_pi=0000003fffffffff cap_pp=0000003fffffffff cap_pe=0000003fffffffff cap_pa=0000000000000000 INTERPRET: type=CAPSET msg=audit(04/06/2017 05:05:02.371:242) : pid=833 cap_pi=chown,dac_override,dac_read_search,fowner,fsetid,kill, setgid,setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,sys_time, sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pp=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pe=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pa=none See: https://github.com/linux-audit/audit-kernel/issues/40 Signed-off-by: Richard Guy Briggs Acked-by: Serge Hallyn Signed-off-by: Paul Moore --- kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index ddfce2ea4891..bb3a4e14b7e5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -68,6 +68,7 @@ struct audit_cap_data { unsigned int fE; /* effective bit of file cap */ kernel_cap_t effective; /* effective set of process */ }; + kernel_cap_t ambient; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b2dcbe637b7c..5fa68d10032f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1260,6 +1260,7 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, @@ -1381,9 +1382,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); - audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); - audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); - audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); + audit_log_cap(ab, "pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "pe", &axs->new_pcap.effective); + audit_log_cap(ab, "pa", &axs->new_pcap.ambient); break; } } @@ -2341,10 +2344,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->old_pcap.permitted = old->cap_permitted; ax->old_pcap.inheritable = old->cap_inheritable; ax->old_pcap.effective = old->cap_effective; + ax->old_pcap.ambient = old->cap_ambient; ax->new_pcap.permitted = new->cap_permitted; ax->new_pcap.inheritable = new->cap_inheritable; ax->new_pcap.effective = new->cap_effective; + ax->new_pcap.ambient = new->cap_ambient; return 0; } @@ -2363,6 +2368,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; + context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; } -- cgit v1.2.3 From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:27 -0700 Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode free up BPF_JMP | BPF_CALL | BPF_X opcode to be used by actual indirect call by register and use kernel internal opcode to mark call instruction into bpf_tail_call() helper. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dedf367f59bb..339289402b96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, - [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, + [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 339c8a1371de..28113d0e8e92 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3469,7 +3469,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * that doesn't support bpf_tail_call yet */ insn->imm = 0; - insn->code |= BPF_X; + insn->code = BPF_JMP | BPF_TAIL_CALL; continue; } -- cgit v1.2.3 From f696b8f471ec987e987e38206b8eb23c39ee5a86 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:28 -0700 Subject: bpf: split bpf core interpreter split __bpf_prog_run() interpreter into stack allocation and execution parts. The code section shrinks which helps interpreter performance in some cases. text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before 25777 10328 624 36729 8f79 kernel/bpf/core.o.after Very short programs got slower (due to extra function call): Before: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 7 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 8 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 7 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 11 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 7 PASS After: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 11 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 11 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 11 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 14 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 10 PASS Longer programs got faster: Before: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 20286 20513 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 31853 31768 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9815 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 6 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13959 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 210 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 21724 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19118 PASS After: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 19008 18827 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 29238 28450 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9485 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 12 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13257 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 213 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 19389 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19583 PASS For real world production programs the difference is noise. This patch is first step towards reducing interpreter stack consumption. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 339289402b96..abd410d394bc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + u64 *stack) { - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; + u64 tmp; static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ @@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - select_insn: goto *jumptable[insn->code]; @@ -1219,7 +1216,17 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } -STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ +STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ + +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG]; + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + return ___bpf_prog_run(regs, insn, stack); +} bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) -- cgit v1.2.3 From 8726679a0fa317f8e83d0843b266453f31bff092 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:29 -0700 Subject: bpf: teach verifier to track stack depth teach verifier to track bpf program stack depth Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28113d0e8e92..d96f27ff9f6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } + + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (t == BPF_WRITE) { if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && @@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -3167,7 +3174,8 @@ process_bpf_exit: insn_idx++; } - verbose("processed %d insns\n", insn_processed); + verbose("processed %d insns, stack depth %d\n", + insn_processed, env->prog->aux->stack_depth); return 0; } -- cgit v1.2.3 From 80a58d02559465b0ea403ff91c8bca9a733b1b0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:30 -0700 Subject: bpf: reconcile bpf_tail_call and stack_depth The next set of patches will take advantage of stack_depth tracking, so make sure that the program that does bpf_tail_call() has stack depth large enough for the callee. We could have tracked the stack depth of the prog_array owner program and only allow insertion of the programs with stack depth less than the owner, but it will break existing applications. Some of them have trivial root bpf program that only does multiple bpf_tail_calls and at init time the prog array is empty. In the future we may add a flag to do such tracking optionally, but for now play simple and safe. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d96f27ff9f6f..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3470,6 +3470,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; + env->prog->aux->stack_depth = MAX_BPF_STACK; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal -- cgit v1.2.3 From b870aa901f4be1d32c13faf9e8f40bf2a8562e19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:33 -0700 Subject: bpf: use different interpreter depending on required stack size 16 __bpf_prog_run() interpreters for various stack sizes add .text but not a lot comparing to run-time stack savings text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before_split 25777 10328 624 36729 8f79 kernel/bpf/core.o.after_split 26970 10328 624 37922 9422 kernel/bpf/core.o.now Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index abd410d394bc..774069ca18a7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1218,16 +1218,38 @@ load_byte: } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG]; - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - return ___bpf_prog_run(regs, insn, stack); +#define PROG_NAME(stack_size) __bpf_prog_run##stack_size +#define DEFINE_BPF_PROG_RUN(stack_size) \ +static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + ARG1 = (u64) (unsigned long) ctx; \ + return ___bpf_prog_run(regs, insn, stack); \ } +#define EVAL1(FN, X) FN(X) +#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) +#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) +#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) +#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) +#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) + +EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); + +#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), + +static unsigned int (*interpreters[])(const void *ctx, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1275,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = (void *) __bpf_prog_run; + fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.2.3 From fb9a307d11d62749d75b404f15517d73f5d6e148 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:15:59 -0700 Subject: bpf: Allow CGROUP_SKB eBPF program to access sk_buff This allows cgroup eBPF program to classify packet based on their protocol or other detail information. Currently program need CAP_NET_ADMIN privilege to attach a cgroup eBPF program, and A process with CAP_NET_ADMIN can already see all packets on the system, for example, by creating an iptables rules that causes the packet to be passed to userspace via NFLOG. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..8acae64df255 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,6 +2426,7 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit v1.2.3 From 80b7d81912d807f161d55e9c2c9cc81061666f83 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:16:00 -0700 Subject: bpf: Remove the capability check for cgroup skb eBPF program Currently loading a cgroup skb eBPF program require a CAP_SYS_ADMIN capability while attaching the program to a cgroup only requires the user have CAP_NET_ADMIN privilege. We can escape the capability check when load the program just like socket filter program to make the capability requirement consistent. Change since v1: Change the code style in order to be compliant with checkpatch.pl preference Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 265a0d854e33..59da103adb85 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -815,7 +815,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + if (type != BPF_PROG_TYPE_SOCKET_FILTER && + type != BPF_PROG_TYPE_CGROUP_SKB && + !capable(CAP_SYS_ADMIN)) return -EPERM; /* plain bpf_prog allocation */ -- cgit v1.2.3 From e5aeee51f6b4fb22e851105ee6d8ad211c40a214 Mon Sep 17 00:00:00 2001 From: Alexander Levin Date: Sat, 3 Jun 2017 03:39:13 +0000 Subject: perf/core: Don't release cred_guard_mutex if not taken If we failed to acquire task's cred_guard_mutex we shouldn't proceed to release it in the error path. Fixes: a63fbed776c ("perf/tracing/cpuhotplug: Fix locking order") Signed-off-by: Alexander Levin Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: mhiramat@kernel.org Cc: paulmck@linux.vnet.ibm.com Cc: bigeasy@linutronix.de Link: http://lkml.kernel.org/r/20170603033903.12056-1-alexander.levin@verizon.com Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b97cda4d1777..1f1b8cdaca2d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9878,7 +9878,7 @@ SYSCALL_DEFINE5(perf_event_open, if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cred; + goto err_task; /* * Reuse ptrace permission checks for now. -- cgit v1.2.3 From 40da1b11f01e43aad1aa6cea64681b6125e8a2a7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 2 Jun 2017 16:27:14 +0200 Subject: cpu/hotplug: Drop the device lock on error If a custom CPU target is specified and that one is not available _or_ can't be interrupted then the code returns to userland without dropping a lock as notices by lockdep: |echo 133 > /sys/devices/system/cpu/cpu7/hotplug/target | ================================================ | [ BUG: lock held when returning to user space! ] | ------------------------------------------------ | bash/503 is leaving the kernel with locks still held! | 1 lock held by bash/503: | #0: (device_hotplug_lock){+.+...}, at: [] lock_device_hotplug_sysfs+0x10/0x40 So release the lock then. Fixes: 757c989b9994 ("cpu/hotplug: Make target state writeable") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170602142714.3ogo25f2wbq6fjpj@linutronix.de --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..cb5103413bd8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1658,13 +1658,13 @@ static ssize_t write_cpuhp_target(struct device *dev, ret = !sp->name || sp->cant_stop ? -EINVAL : 0; mutex_unlock(&cpuhp_state_mutex); if (ret) - return ret; + goto out; if (st->state < target) ret = do_cpu_up(dev->id, target); else ret = do_cpu_down(dev->id, target); - +out: unlock_device_hotplug(); return ret ? ret : count; } -- cgit v1.2.3 From 201d7f47f34bd7cb19161d0426f13b141e381f30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 May 2017 11:58:32 +0200 Subject: genirq: Handle NOAUTOEN interrupt setup proper If an interrupt is marked NOAUTOEN then request_irq() installs the action, but does not enable the interrupt via startup_irq(). The interrupt is enabled via enable_irq() later from the driver. enable_irq() calls irq_enable(). That means that for interrupts which have a irq_startup() callback this callback is never invoked. Neither is irq_domain_activate_irq() invoked for such interrupts. If an interrupt depends on irq_startup() or irq_domain_activate_irq() then the enable via irq_enable() is not enough. Add a status flag IRQD_IRQ_STARTED_UP and use this to select the proper mechanism in enable_irq(). Use the flag also to avoid pointless calls into the low level functions. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: dianders@chromium.org Cc: jeffy Cc: Brian Norris Cc: tfiga@chromium.org Link: http://lkml.kernel.org/r/20170531100212.130986205@linutronix.de --- kernel/irq/chip.c | 76 +++++++++++++++++++++++++++++++++++++---------------- kernel/irq/manage.c | 12 ++++++--- 2 files changed, 63 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c94da688ee9b..e0051d58c909 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -185,37 +185,64 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } +static void irq_state_clr_started(struct irq_desc *desc) +{ + irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); +} + +static void irq_state_set_started(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); +} + int irq_startup(struct irq_desc *desc, bool resend) { int ret = 0; - irq_state_clr_disabled(desc); desc->depth = 0; - irq_domain_activate_irq(&desc->irq_data); - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_masked(desc); - } else { + if (irqd_is_started(&desc->irq_data)) { irq_enable(desc); + } else { + irq_domain_activate_irq(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_disabled(desc); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + irq_state_set_started(desc); } + if (resend) check_irq_resend(desc); + return ret; } +static void __irq_disable(struct irq_desc *desc, bool mask); + void irq_shutdown(struct irq_desc *desc) { - irq_state_set_disabled(desc); - desc->depth = 1; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else if (desc->irq_data.chip->irq_disable) - desc->irq_data.chip->irq_disable(&desc->irq_data); - else - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (irqd_is_started(&desc->irq_data)) { + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) { + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_state_set_disabled(desc); + irq_state_set_masked(desc); + } else { + __irq_disable(desc, true); + } + irq_state_clr_started(desc); + } + /* + * This must be called even if the interrupt was never started up, + * because the activation can happen before the interrupt is + * available for request/startup. It has it's own state tracking so + * it's safe to call it unconditionally. + */ irq_domain_deactivate_irq(&desc->irq_data); - irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -228,6 +255,17 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +static void __irq_disable(struct irq_desc *desc, bool mask) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } else if (mask) { + mask_irq(desc); + } +} + /** * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled @@ -250,13 +288,7 @@ void irq_enable(struct irq_desc *desc) */ void irq_disable(struct irq_desc *desc) { - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } else if (irq_settings_disable_unlazy(desc)) { - mask_irq(desc); - } + __irq_disable(desc, irq_settings_disable_unlazy(desc)); } void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..57056109f176 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -533,9 +533,15 @@ void __enable_irq(struct irq_desc *desc) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); - irq_enable(desc); - check_irq_resend(desc); - /* fall-through */ + /* + * Call irq_startup() not irq_enable() here because the + * interrupt might be marked NOAUTOEN. So irq_startup() + * needs to be invoked when it gets enabled the first + * time. If it was already started up, then irq_startup() + * will invoke irq_enable() under the hood. + */ + irq_startup(desc, true); + break; } default: desc->depth--; -- cgit v1.2.3 From 04c848d398797a626608ff48804d809ae6687163 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 May 2017 11:58:33 +0200 Subject: genirq: Warn when IRQ_NOAUTOEN is used with shared interrupts Shared interrupts do not go well with disabling auto enable: 1) The sharing interrupt might request it while it's still disabled and then wait for interrupts forever. 2) The interrupt might have been requested by the driver sharing the line before IRQ_NOAUTOEN has been set. So the driver which expects that disabled state after calling request_irq() will not get what it wants. Even worse, when it calls enable_irq() later, it will trigger the unbalanced enable_irq() warning. Reported-by: Brian Norris Signed-off-by: Thomas Gleixner Cc: dianders@chromium.org Cc: jeffy Cc: Marc Zyngier Cc: tfiga@chromium.org Link: http://lkml.kernel.org/r/20170531100212.210682135@linutronix.de --- kernel/irq/chip.c | 7 +++++++ kernel/irq/manage.c | 12 ++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e0051d58c909..bc1331f84fb5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -935,6 +935,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; + + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. + */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); + irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 57056109f176..49c37f1e71c0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1334,11 +1334,19 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (irq_settings_can_autoenable(desc)) + if (irq_settings_can_autoenable(desc)) { irq_startup(desc, true); - else + } else { + /* + * Shared interrupts do not go well with disabling + * auto enable. The sharing interrupt might request + * it while it's still disabled and then wait for + * interrupts forever. + */ + WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; + } /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { -- cgit v1.2.3 From 31ea70e0308b73a1b862bd17c06efc3cbcfd2016 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 21:01:00 +0200 Subject: posix-timers: Move the do_schedule_next_timer declaration Having it in asm-generic/siginfo.h doesn't make any sense as it is in no way architecture specific. Move it to posix-timers.h instead. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Arnd Bergmann Cc: sparclinux@vger.kernel.org Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20170603190102.28866-4-hch@lst.de --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..1f85c843be8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -39,6 +39,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From f4781e76f90df7aec400635d73ea4c35ee1d4765 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:34 +0200 Subject: alarmtimer: Prevent overflow of relative timers Andrey reported a alartimer related RCU stall while fuzzing the kernel with syzkaller. The reason for this is an overflow in ktime_add() which brings the resulting time into negative space and causes immediate expiry of the timer. The following rearm with a small interval does not bring the timer back into positive space due to the same issue. This results in a permanent firing alarmtimer which hogs the CPU. Use ktime_add_safe() instead which detects the overflow and clamps the result to KTIME_SEC_MAX. Reported-by: Andrey Konovalov Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Kostya Serebryany Cc: syzkaller Cc: John Stultz Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170530211655.802921648@linutronix.de --- kernel/time/alarmtimer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..2b2e032b4eb4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -387,7 +387,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add(start, base->gettime()); + start = ktime_add_safe(start, base->gettime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -475,7 +475,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) overrun++; } - alarm->node.expires = ktime_add(alarm->node.expires, interval); + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); @@ -666,7 +666,7 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, ktime_t now; now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add(now, exp); + exp = ktime_add_safe(now, exp); } alarm_start(&timr->it.alarm.alarmtimer, exp); -- cgit v1.2.3 From ff86bf0c65f14346bf2440534f9ba5ac232c39a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:35 +0200 Subject: alarmtimer: Rate limit periodic intervals The alarmtimer code has another source of potentially rearming itself too fast. Interval timers with a very samll interval have a similar CPU hog effect as the previously fixed overflow issue. The reason is that alarmtimers do not implement the normal protection against this kind of problem which the other posix timer use: timer expires -> queue signal -> deliver signal -> rearm timer This scheme brings the rearming under scheduler control and prevents permanently firing timers which hog the CPU. Bringing this scheme to the alarm timer code is a major overhaul because it lacks all the necessary mechanisms completely. So for a quick fix limit the interval to one jiffie. This is not problematic in practice as alarmtimers are usually backed by an RTC for suspend which have 1 second resolution. It could be therefor argued that the resolution of this clock should be set to 1 second in general, but that's outside the scope of this fix. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Kostya Serebryany Cc: syzkaller Cc: John Stultz Cc: Dmitry Vyukov Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170530211655.896767100@linutronix.de --- kernel/time/alarmtimer.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2b2e032b4eb4..ee2f4202d82a 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -660,6 +660,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + + /* + * Rate limit to the tick as a hot fix to prevent DOS. Will be + * mopped up later. + */ + if (timr->it.alarm.interval < TICK_NSEC) + timr->it.alarm.interval = TICK_NSEC; + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { -- cgit v1.2.3 From 18c700c4e3d0a37c43a2df8b8f740121d4dac645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:36 +0200 Subject: alarmtimer: Remove pointless config conditional Having a IF_ENABLED(CONFIG_POSIX_TIMERS) inside of a #ifdef CONFIG_POSIX_TIMERS section is pointless. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211655.975218056@linutronix.de --- kernel/time/alarmtimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e645dcc7d4ee..2a8675f9aac5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -520,8 +520,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (IS_ENABLED(CONFIG_POSIX_TIMERS) && - posix_timer_event(ptr, 0) != 0) + if (posix_timer_event(ptr, 0)) ptr->it_overrun++; } -- cgit v1.2.3 From a81129e5a189973abd661704b261f8aad9325407 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:37 +0200 Subject: posix-timers: Remove unused export of posix_timer_event() Since the removal of the mmtimer driver the export is not longer needed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.052744418@linutronix.de --- kernel/time/posix-timers.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0c0cccfa3586..44d486590e6e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -442,7 +442,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) /* If we failed to send the signal the timer stops. */ return ret > 0; } -EXPORT_SYMBOL_GPL(posix_timer_event); /* * This function gets called when a POSIX.1b interval timer expires. It -- cgit v1.2.3 From 3a06c7ac24f9f24ec059cd77c2dbdf7fbfd0aaaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:38 +0200 Subject: posix-clocks: Remove interval timer facility and mmap/fasync callbacks The only user of this facility is ptp_clock, which does not implement any of those functions. Remove them to prevent accidental users. Especially the interval timer interfaces are now more or less impossible to implement because the necessary infrastructure has been confined to the core code. Aside of that it's really complex to make these callbacks implemented according to spec as the alarm timer implementation demonstrates. If at all then a nanosleep callback might be a reasonable extension. For now keep just what ptp_clock needs. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.145036286@linutronix.de --- kernel/time/posix-clock.c | 113 ---------------------------------------------- 1 file changed, 113 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 7e453005e078..bd4fb785652f 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -82,38 +82,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) return result; } -static int posix_clock_fasync(int fd, struct file *fp, int on) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = 0; - - if (!clk) - return -ENODEV; - - if (clk->ops.fasync) - err = clk->ops.fasync(clk, fd, fp, on); - - put_posix_clock(clk); - - return err; -} - -static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENODEV; - - if (!clk) - return -ENODEV; - - if (clk->ops.mmap) - err = clk->ops.mmap(clk, vma); - - put_posix_clock(clk); - - return err; -} - static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { @@ -199,8 +167,6 @@ static const struct file_operations posix_clock_file_operations = { .unlocked_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, - .fasync = posix_clock_fasync, - .mmap = posix_clock_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = posix_clock_compat_ioctl, #endif @@ -359,88 +325,9 @@ out: return err; } -static int pc_timer_create(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_create) - err = cd.clk->ops.timer_create(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_timer_delete(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_delete) - err = cd.clk->ops.timer_delete(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - - if (get_clock_desc(id, &cd)) - return; - - if (cd.clk->ops.timer_gettime) - cd.clk->ops.timer_gettime(cd.clk, kit, ts); - - put_clock_desc(&cd); -} - -static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec64 *ts, struct itimerspec64 *old) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_settime) - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, .clock_adj = pc_clock_adjtime, - .timer_create = pc_timer_create, - .timer_set = pc_timer_settime, - .timer_del = pc_timer_delete, - .timer_get = pc_timer_gettime, }; -- cgit v1.2.3 From 6631fa12c105e326bbe5fb215eb216e86c90d1ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:39 +0200 Subject: posix-timers: Avoid gazillions of forward declarations Move it below the actual implementations as there are new callbacks coming which would require even more forward declarations. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.238209952@linutronix.de --- kernel/time/posix-timers.c | 190 +++++++++++++++++++++------------------------ 1 file changed, 89 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 44d486590e6e..b60b655dfbcd 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -69,6 +69,9 @@ static struct kmem_cache *posix_timers_cache; static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); +static const struct k_clock * const posix_clocks[]; +static const struct k_clock *clockid_to_kclock(const clockid_t id); + /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. @@ -124,20 +127,6 @@ static DEFINE_SPINLOCK(hash_lock); * have is CLOCK_REALTIME and its high res counter part, both of * which we beg off on and pass to do_sys_settimeofday(). */ - -/* - * These ones are defined below. - */ -static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, - struct timespec __user *rmtp); -static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec64 *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec64 *, struct itimerspec64 *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -278,82 +267,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } - -static const struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, -}; - -static const struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, -}; - -static const struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, -}; - -static const struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock * const posix_clocks[] = { - [CLOCK_REALTIME] = &clock_realtime, - [CLOCK_MONOTONIC] = &clock_monotonic, - [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, - [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, - [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, - [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, - [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_boottime, - [CLOCK_REALTIME_ALARM] = &alarm_clock, - [CLOCK_BOOTTIME_ALARM] = &alarm_clock, - [CLOCK_TAI] = &clock_tai, -}; - /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -567,17 +480,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static const struct k_clock *clockid_to_kclock(const clockid_t id) -{ - if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? - &clock_posix_dynamic : &clock_posix_cpu; - - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) - return NULL; - return posix_clocks[id]; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -1129,3 +1031,89 @@ long clock_nanosleep_restart(struct restart_block *restart_block) return kc->nsleep_restart(restart_block); } + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + +static const struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + return NULL; + return posix_clocks[id]; +} -- cgit v1.2.3 From bab0aae9dcba9466dcc968b8bd21914f8f691631 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:41 +0200 Subject: posix-timers: Move posix-timer internals to core None of these declarations is required outside of kernel/time. Move them to an internal header. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170530211656.394803853@linutronix.de --- kernel/time/alarmtimer.c | 2 ++ kernel/time/posix-clock.c | 2 ++ kernel/time/posix-cpu-timers.c | 2 ++ kernel/time/posix-timers.c | 1 + kernel/time/posix-timers.h | 29 +++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+) create mode 100644 kernel/time/posix-timers.h (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2a8675f9aac5..36855d675da5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -28,6 +28,8 @@ #include #include +#include "posix-timers.h" + #define CREATE_TRACE_POINTS #include diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index bd4fb785652f..17cdc554c9fe 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -25,6 +25,8 @@ #include #include +#include "posix-timers.h" + static void delete_clock(struct kref *kref); /* diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index c99434739fd5..a77a792f2570 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -13,6 +13,8 @@ #include #include +#include "posix-timers.h" + /* * Called after updating RLIMIT_CPU to run cpu timer and update * tsk->signal->cputime_expires expiration cache if necessary. Needs diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b60b655dfbcd..dee6a0d911d4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -51,6 +51,7 @@ #include #include "timekeeping.h" +#include "posix-timers.h" /* * Management arrays for POSIX timers. Timers are now kept in static hash table diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 000000000000..ad2dbd29b389 --- /dev/null +++ b/kernel/time/posix-timers.h @@ -0,0 +1,29 @@ +#define TIMER_RETRY 1 + +struct k_clock { + int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); + int (*clock_set) (const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); + int (*clock_adj) (const clockid_t which_clock, struct timex *tx); + int (*timer_create) (struct k_itimer *timer); + int (*nsleep) (const clockid_t which_clock, int flags, + struct timespec64 *, struct timespec __user *); + long (*nsleep_restart) (struct restart_block *restart_block); + int (*timer_set) (struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del) (struct k_itimer *timr); + void (*timer_get) (struct k_itimer *timr, + struct itimerspec64 *cur_setting); +}; + +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; + +int posix_timer_event(struct k_itimer *timr, int si_private); + +void posix_cpu_timer_schedule(struct k_itimer *timer); -- cgit v1.2.3 From af888d677a3f4473c198b4720319dd037f398b51 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:42 +0200 Subject: posix-timers: Unify overrun/requeue_pending handling hrtimer based posix-timers and posix-cpu-timers handle the update of the rearming and overflow related status fields differently. Move that update to the common rearming code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.484936964@linutronix.de --- kernel/time/posix-cpu-timers.c | 18 +++++++----------- kernel/time/posix-timers.c | 15 ++++++++------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a77a792f2570..1683e503179e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -527,6 +527,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * ticking in case the signal is deliverable next time. */ posix_cpu_timer_schedule(timer); + ++timer->it_requeue_pending; } } @@ -997,12 +998,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) - goto out; + return; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (!sighand) - goto out; + return; } else { /* * Protect arm_timer() and timer sampling in case of call to @@ -1015,11 +1016,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * We can't even collect a sample any more. */ timer->it.cpu.expires = 0; - goto out; + return; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - unlock_task_sighand(p, &flags); - /* Optimizations: if the process is dying, no need to rearm */ - goto out; + /* If the process is dying, no need to rearm */ + goto unlock; } cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1031,12 +1031,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); +unlock: unlock_task_sighand(p, &flags); - -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; } /** diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dee6a0d911d4..79a00e0f1ef9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -291,10 +291,6 @@ static void schedule_next_timer(struct k_itimer *timr) timr->it_overrun += (unsigned int) hrtimer_forward(timer, timer->base->get_time(), timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; hrtimer_restart(timer); } @@ -315,18 +311,23 @@ void do_schedule_next_timer(struct siginfo *info) unsigned long flags; timr = lock_timer(info->si_tid, &flags); + if (!timr) + return; - if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_requeue_pending == info->si_sys_private) { if (timr->it_clock < 0) posix_cpu_timer_schedule(timr); else schedule_next_timer(timr); + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + info->si_overrun += timr->it_overrun_last; } - if (timr) - unlock_timer(timr, flags); + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr, int si_private) -- cgit v1.2.3 From 80105cd0e62ba8a2caf8eebd52f42952c7c04046 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:43 +0200 Subject: posix-timers: Move interval out of the union Preparatory patch to unify the alarm timer and hrtimer based posix interval timer handling. The interval is used as a criteria for rearming decisions so moving it out of the clock specific data structures allows later unification. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.563922908@linutronix.de --- kernel/time/alarmtimer.c | 13 ++++++------- kernel/time/posix-timers.c | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 36855d675da5..5b8cf4b61854 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -527,9 +527,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, } /* Re-add periodic timers */ - if (ptr->it.alarm.interval) { - ptr->it_overrun += alarm_forward(alarm, now, - ptr->it.alarm.interval); + if (ptr->it_interval) { + ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval); result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); @@ -613,7 +612,7 @@ static void alarm_timer_get(struct k_itimer *timr, cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it_interval); } /** @@ -662,14 +661,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); /* * Rate limit to the tick as a hot fix to prevent DOS. Will be * mopped up later. */ - if (timr->it.alarm.interval < TICK_NSEC) - timr->it.alarm.interval = TICK_NSEC; + if (timr->it_interval < TICK_NSEC) + timr->it_interval = TICK_NSEC; exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 79a00e0f1ef9..7dd992cc7105 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -285,12 +285,12 @@ static void schedule_next_timer(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - if (timr->it.real.interval == 0) + if (!timr->it_interval) return; timr->it_overrun += (unsigned int) hrtimer_forward(timer, timer->base->get_time(), - timr->it.real.interval); + timr->it_interval); hrtimer_restart(timer); } @@ -375,7 +375,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - if (timr->it.real.interval != 0) + if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { @@ -384,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * we will not get a call back to restart it AND * it should be restarted. */ - if (timr->it.real.interval != 0) { + if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* @@ -413,13 +413,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { ktime_t kj = NSEC_PER_SEC / HZ; - if (timr->it.real.interval < kj) + if (timr->it_interval < kj) now = ktime_add(now, kj); } #endif timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, - timr->it.real.interval); + timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; } @@ -631,7 +631,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) memset(cur_setting, 0, sizeof(*cur_setting)); - iv = timr->it.real.interval; + iv = timr->it_interval; /* interval timer ? */ if (iv) @@ -732,7 +732,7 @@ common_timer_set(struct k_itimer *timr, int flags, common_timer_get(timr, old_setting); /* disable the timer */ - timr->it.real.interval = 0; + timr->it_interval = 0; /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. @@ -755,7 +755,7 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -820,7 +820,7 @@ retry: static int common_timer_del(struct k_itimer *timer) { - timer->it.real.interval = 0; + timer->it_interval = 0; if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; -- cgit v1.2.3 From d97bb75ddd2f38068df01da8abf26df78756253c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:44 +0200 Subject: posix-timers: Store k_clock pointer in k_itimer Having the k_clock pointer in the k_itimer struct avoids the lookup in several code pathes and makes the next steps of unification of the hrtimer and alarmtimer based posix timers simpler. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.641222072@linutronix.de --- kernel/time/posix-cpu-timers.c | 2 ++ kernel/time/posix-timers.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1683e503179e..0123ece6851b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -324,6 +324,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) return -EINVAL; + new_timer->kclock = &clock_posix_cpu; + INIT_LIST_HEAD(&new_timer->it.cpu.entry); rcu_read_lock(); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 7dd992cc7105..eb007e19811d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -519,6 +519,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; + new_timer->kclock = kc; new_timer->it_overrun = -1; if (timer_event_spec) { @@ -679,7 +680,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else @@ -798,7 +799,7 @@ retry: if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else @@ -829,7 +830,7 @@ static int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - const struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = timer->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; -- cgit v1.2.3 From 30802945893bc944b5971b408b37511a03b54e5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:45 +0200 Subject: posix-timers: Add timer_rearm() callback Add a timer_rearm() callback which is used to make the rescheduling of posix interval timers independent of the underlying clock implementation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.732632167@linutronix.de --- kernel/time/posix-timers.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ad2dbd29b389..02ffd1b9d230 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,21 +1,24 @@ #define TIMER_RETRY 1 struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_set) (const clockid_t which_clock, - const struct timespec64 *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj) (const clockid_t which_clock, struct timex *tx); - int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); - long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting); - int (*timer_del) (struct k_itimer *timr); - void (*timer_get) (struct k_itimer *timr, - struct itimerspec64 *cur_setting); + int (*clock_getres)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_set)(const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*timer_create)(struct k_itimer *timer); + int (*nsleep)(const clockid_t which_clock, int flags, + struct timespec64 *, struct timespec __user *); + long (*nsleep_restart)(struct restart_block *restart_block); + int (*timer_set)(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del)(struct k_itimer *timr); + void (*timer_get)(struct k_itimer *timr, + struct itimerspec64 *cur_setting); + void (*timer_rearm)(struct k_itimer *timr); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.2.3 From 96fe3b072f134e4993f829d599eaa1e0eb5a10e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:46 +0200 Subject: posix-timers: Rename do_schedule_next_timer That function is a misnomer. Rename it with a proper prefix to posixtimer_rearm(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.811362578@linutronix.de --- kernel/signal.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1f85c843be8e..d031cd24f8a9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -630,7 +630,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * about to disable them again anyway. */ spin_unlock(&tsk->sighand->siglock); - do_schedule_next_timer(info); + posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0123ece6851b..1ba576d3151a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -981,7 +981,7 @@ static void check_process_timers(struct task_struct *tsk, } /* - * This is called from the signal code (via do_schedule_next_timer) + * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. */ void posix_cpu_timer_schedule(struct k_itimer *timer) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index eb007e19811d..036b7e70c65c 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -305,7 +305,7 @@ static void schedule_next_timer(struct k_itimer *timr) * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ -void do_schedule_next_timer(struct siginfo *info) +void posixtimer_rearm(struct siginfo *info) { struct k_itimer *timr; unsigned long flags; @@ -336,12 +336,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private) int shared, ret = -1; /* * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). + * dequeue_signal()->posixtimer_rearm(). * * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). + * si_sys_private it calls posixtimer_rearm(). * We re-queue ->sigq and drop ->it_lock(). - * do_schedule_next_timer() locks the timer + * posixtimer_rearm() locks the timer * and re-schedules it while ->sigq is pending. * Not really bad, but not that we want. */ @@ -701,7 +701,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, * accumulating overruns on the next timer. The overrun is frozen when * the signal is delivered, either at the notify time (if the info block * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is + * the call back to posixtimer_rearm(). So all we need to do is * to pick up the frozen overrun. */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) -- cgit v1.2.3 From f37fb0aa4f453c7c785bbcecc4991ac48c5c0e51 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:47 +0200 Subject: posix-timers: Use timer_rearm() callback in posixtimer_rearm() Use the new timer_rearm() callback to replace the conditional hardcoded calls into the hrtimer and cpu timer code. This allows later to bring the same logic to alarmtimers. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.889661919@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 +++++-- kernel/time/posix-timers.c | 12 ++++++------ kernel/time/posix-timers.h | 2 -- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1ba576d3151a..96c833a61ade 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -15,6 +15,8 @@ #include "posix-timers.h" +static void posix_cpu_timer_rearm(struct k_itimer *timer); + /* * Called after updating RLIMIT_CPU to run cpu timer and update * tsk->signal->cputime_expires expiration cache if necessary. Needs @@ -528,7 +530,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * reload the timer. But we need to keep it * ticking in case the signal is deliverable next time. */ - posix_cpu_timer_schedule(timer); + posix_cpu_timer_rearm(timer); ++timer->it_requeue_pending; } } @@ -984,7 +986,7 @@ static void check_process_timers(struct task_struct *tsk, * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. */ -void posix_cpu_timer_schedule(struct k_itimer *timer) +static void posix_cpu_timer_rearm(struct k_itimer *timer) { struct sighand_struct *sighand; unsigned long flags; @@ -1431,6 +1433,7 @@ const struct k_clock clock_posix_cpu = { .timer_set = posix_cpu_timer_set, .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 036b7e70c65c..b12582a4b122 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -278,10 +278,9 @@ static __init int init_posix_timers(void) NULL); return 0; } - __initcall(init_posix_timers); -static void schedule_next_timer(struct k_itimer *timr) +static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; @@ -315,10 +314,7 @@ void posixtimer_rearm(struct siginfo *info) return; if (timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); + timr->kclock->timer_rearm(timr); timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; @@ -1046,6 +1042,7 @@ static const struct k_clock clock_realtime = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_monotonic = { @@ -1057,6 +1054,7 @@ static const struct k_clock clock_monotonic = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_monotonic_raw = { @@ -1083,6 +1081,7 @@ static const struct k_clock clock_tai = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_boottime = { @@ -1094,6 +1093,7 @@ static const struct k_clock clock_boottime = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock * const posix_clocks[] = { diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 02ffd1b9d230..1f6f6f9a6a37 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -28,5 +28,3 @@ extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; int posix_timer_event(struct k_itimer *timr, int si_private); - -void posix_cpu_timer_schedule(struct k_itimer *timer); -- cgit v1.2.3 From 21e55c1f83880a56360287c00f2b5cd5e5a4a912 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:48 +0200 Subject: posix-timers: Add active flag to k_itimer Keep track of the activation state of posix timers. This is a preparatory change for making common_timer_get() usable by both hrtimer and alarm timer implementations. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.967783982@linutronix.de --- kernel/time/posix-timers.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b12582a4b122..795215bba73d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -316,6 +316,7 @@ void posixtimer_rearm(struct siginfo *info) if (timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); + timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; @@ -371,6 +372,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); + timr->it_active = 0; if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; @@ -418,6 +420,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; + timr->it_active = 1; } } @@ -737,7 +740,8 @@ common_timer_set(struct k_itimer *timr, int flags, if (hrtimer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + timr->it_active = 0; + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; @@ -763,6 +767,7 @@ common_timer_set(struct k_itimer *timr, int flags, return 0; } + timr->it_active = 1; hrtimer_start_expires(timer, mode); return 0; } @@ -821,6 +826,7 @@ static int common_timer_del(struct k_itimer *timer) if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; + timer->it_active = 0; return 0; } -- cgit v1.2.3 From 63841b2a6969501de183efafc14d20175e402804 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:49 +0200 Subject: posix-timers: Add forward/remaining callbacks Add two callbacks to kclock which allow using common_)timer_get() for both hrtimer and alarm timer based clocks. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.044915536@linutronix.de --- kernel/time/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 1f6f6f9a6a37..3bc5b74c342f 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -19,6 +19,8 @@ struct k_clock { void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); void (*timer_rearm)(struct k_itimer *timr); + int (*timer_forward)(struct k_itimer *timr, ktime_t now); + ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.2.3 From 91d57bae08689199c8acc77a8b3b41150cafab1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:50 +0200 Subject: posix-timers: Make use of forward/remaining callbacks Replace the hrtimer calls by calls to the new forward/remaining kclock callbacks and move the hrtimer specific implementation into the corresponding callback functions. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.121437232@linutronix.de --- kernel/time/posix-timers.c | 64 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 795215bba73d..48f6c37ae5df 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -607,6 +607,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; } +static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return __hrtimer_expires_remaining_adjusted(timer, now); +} + +static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return (int)hrtimer_forward(timer, now, timr->it_interval); +} + /* * Get the time remaining on a POSIX.1b interval timer. This function * is ALWAYS called with spin_lock_irq on the timer, thus it must not @@ -626,42 +640,54 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) static void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { + const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; + struct timespec64 ts64; + bool sig_none; memset(cur_setting, 0, sizeof(*cur_setting)); + sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ - if (iv) + if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; + } else if (!timr->it_active) { + /* + * SIGEV_NONE oneshot timers are never queued. Check them + * below. + */ + if (!sig_none) + return; + } - now = timer->base->get_time(); + /* + * The timespec64 based conversion is suboptimal, but it's not + * worth to implement yet another callback. + */ + kc->clock_get(timr->it_clock, &ts64); + now = timespec64_to_ktime(ts64); /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. + * When a requeue is pending or this is a SIGEV_NONE timer move the + * expiry time forward by intervals, so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + timr->it_overrun += kc->timer_forward(timr, now); - remaining = __hrtimer_expires_remaining_adjusted(timer, now); + remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when * it is expired ! */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + if (!sig_none) cur_setting->it_value.tv_nsec = 1; - } else + } else { cur_setting->it_value = ktime_to_timespec64(remaining); + } } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -1049,6 +1075,8 @@ static const struct k_clock clock_realtime = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_monotonic = { @@ -1061,6 +1089,8 @@ static const struct k_clock clock_monotonic = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_monotonic_raw = { @@ -1088,6 +1118,8 @@ static const struct k_clock clock_tai = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_boottime = { @@ -1100,6 +1132,8 @@ static const struct k_clock clock_boottime = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock * const posix_clocks[] = { -- cgit v1.2.3 From eabdec04385376d560078992710104cc7be2ce1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:51 +0200 Subject: posix-timers: Zero settings value in common code Zero out the settings struct in the common code so the callbacks do not have to do it themself. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.200870713@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +---- kernel/time/posix-timers.c | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 96c833a61ade..cb4a4eb44279 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -719,10 +719,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp */ itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); - if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + if (!timer->it.cpu.expires) return; - } /* * Sample the clock to take the difference with the expiry time. @@ -746,7 +744,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 48f6c37ae5df..0332f7a60fd6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -645,8 +645,6 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - memset(cur_setting, 0, sizeof(*cur_setting)); - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; iv = timr->it_interval; @@ -705,6 +703,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; + memset(&cur_setting64, 0, sizeof(cur_setting64)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; -- cgit v1.2.3 From 525b8ed91671e29e187dfe02d408b11190ccf494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:52 +0200 Subject: posix-timers: Add cancel/arm callbacks Add timer_try_to_cancel() and timer_arm() callbacks to kclock which allow to make common_timer_set() usable by both hrtimer and alarmtimer based clocks. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.278022962@linutronix.de --- kernel/time/posix-timers.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 3bc5b74c342f..b0ad77e18886 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -21,6 +21,9 @@ struct k_clock { void (*timer_rearm)(struct k_itimer *timr); int (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); + int (*timer_try_to_cancel)(struct k_itimer *timr); + void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.2.3 From eae1c4ae275fe3e024454c012a548ee0d700f54c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:53 +0200 Subject: posix-timers: Make use of cancel/arm callbacks Replace the hrtimer calls by calls to the new try_to_cancel()/arm() kclock callbacks and move the hrtimer specific implementation into the corresponding callback functions. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.355396667@linutronix.de --- kernel/time/posix-timers.c | 181 +++++++++++++++++++++++++-------------------- 1 file changed, 100 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0332f7a60fd6..8acc9ee2c2d6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -744,25 +744,49 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } +static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + if (!absolute) + expires = ktime_add_safe(expires, timer->base->get_time()); + hrtimer_set_expires(timer, expires); + + if (!sigev_none) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static int common_hrtimer_try_to_cancel(struct k_itimer *timr) +{ + return hrtimer_try_to_cancel(&timr->it.real.timer); +} + /* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { - struct hrtimer *timer = &timr->it.real.timer; - enum hrtimer_mode mode; + const struct k_clock *kc = timr->kclock; + bool sigev_none; + ktime_t expires; if (old_setting) common_timer_get(timr, old_setting); - /* disable the timer */ + /* Prevent rearming by clearing the interval */ timr->it_interval = 0; /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. + * Careful here. On SMP systems the timer expiry function could be + * active and spinning on timr->it_lock. */ - if (hrtimer_try_to_cancel(timer) < 0) + if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; timr->it_active = 0; @@ -770,30 +794,16 @@ common_timer_set(struct k_itimer *timr, int flags, ~REQUEUE_PENDING; timr->it_overrun_last = 0; - /* switch off the timer when it_value is zero */ + /* Switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); - - /* Convert interval */ timr->it_interval = timespec64_to_ktime(new_setting->it_interval); + expires = timespec64_to_ktime(new_setting->it_value); + sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; - /* SIGEV_NONE timers are not queued ! See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - hrtimer_add_expires(timer, timer->base->get_time()); - } - return 0; - } - - timr->it_active = 1; - hrtimer_start_expires(timer, mode); + kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); + timr->it_active = !sigev_none; return 0; } @@ -847,9 +857,10 @@ retry: static int common_timer_del(struct k_itimer *timer) { - timer->it_interval = 0; + const struct k_clock *kc = timer->kclock; - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + timer->it_interval = 0; + if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_active = 0; return 0; @@ -1063,76 +1074,84 @@ long clock_nanosleep_restart(struct restart_block *restart_block) } static const struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { -- cgit v1.2.3 From b3db80f77a95a45dbb2136f7b2a364dc797ea914 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:54 +0200 Subject: alarmtimer: Implement timer_rearm() callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.434598989@linutronix.de --- kernel/time/alarmtimer.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5b8cf4b61854..be85e3cbfe1b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -536,6 +536,18 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, return result; } +/** + * alarm_timer_rearm - Posix timer callback for rearming timer + * @timr: Pointer to the posixtimer data struct + */ +static void alarm_timer_rearm(struct k_itimer *timr) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); + alarm_start(alarm, alarm->node.expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -594,7 +606,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) /** * alarm_timer_get - posix timer_get interface - * @new_timer: k_itimer pointer + * @timr: k_itimer pointer * @cur_setting: itimerspec data to fill * * Copies out the current itimerspec data @@ -863,6 +875,7 @@ const struct k_clock alarm_clock = { .timer_set = alarm_timer_set, .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, + .timer_rearm = alarm_timer_rearm, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.2.3 From e7561f1633ac735df48c55ad09a2530e9ab9fab1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:55 +0200 Subject: alarmtimer: Implement forward callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.513694229@linutronix.de --- kernel/time/alarmtimer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index be85e3cbfe1b..6082cf1af876 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -548,6 +548,18 @@ static void alarm_timer_rearm(struct k_itimer *timr) alarm_start(alarm, alarm->node.expires); } +/** + * alarm_timer_forward - Posix timer callback for forwarding timer + * @timr: Pointer to the posixtimer data struct + * @now: Current time to forward the timer against + */ +static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return (int) alarm_forward(alarm, timr->it_interval, now); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -876,6 +888,7 @@ const struct k_clock alarm_clock = { .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.2.3 From d653d8457c76da11f047af1f66256ac9b8421b69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:56 +0200 Subject: alarmtimer: Implement remaining callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.592676753@linutronix.de --- kernel/time/alarmtimer.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6082cf1af876..02ddc40f19fe 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -560,6 +560,18 @@ static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) return (int) alarm_forward(alarm, timr->it_interval, now); } +/** + * alarm_timer_remaining - Posix timer callback to retrieve remaining time + * @timr: Pointer to the posixtimer data struct + * @now: Current time to calculate against + */ +static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return ktime_sub(now, alarm->node.expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -881,15 +893,16 @@ out: } const struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .timer_rearm = alarm_timer_rearm, - .timer_forward = alarm_timer_forward, - .nsleep = alarm_timer_nsleep, + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, + .timer_remaining = alarm_timer_remaining, + .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.2.3 From e344c9e76bc6af997926171bfd90d25bbae0a2c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:57 +0200 Subject: alarmtimer: Implement try_to_cancel callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.670026824@linutronix.de --- kernel/time/alarmtimer.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 02ddc40f19fe..374bd855a488 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -572,6 +572,15 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) return ktime_sub(now, alarm->node.expires); } +/** + * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer + * @timr: Pointer to the posixtimer data struct + */ +static int alarm_timer_try_to_cancel(struct k_itimer *timr) +{ + return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -902,6 +911,7 @@ const struct k_clock alarm_clock = { .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, + .timer_try_to_cancel = alarm_timer_try_to_cancel, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.2.3 From b3bf6f369d50ece9dec6338741648005d95c19e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:58 +0200 Subject: alarmtimer: Implement arm callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.747567162@linutronix.de --- kernel/time/alarmtimer.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 374bd855a488..c618a44bb054 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -581,6 +581,27 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); } +/** + * alarm_timer_arm - Posix timer callback to arm a timer + * @timr: Pointer to the posixtimer data struct + * @expires: The new expiry time + * @absolute: Expiry value is absolute time + * @sigev_none: Posix timer does not deliver signals + */ +static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + struct alarm_base *base = &alarm_bases[alarm->type]; + + if (!absolute) + expires = ktime_add_safe(expires, base->gettime()); + if (sigev_none) + alarm->node.expires = expires; + else + alarm_start(&timr->it.alarm.alarmtimer, expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -908,6 +929,7 @@ const struct k_clock alarm_clock = { .timer_set = alarm_timer_set, .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, + .timer_arm = alarm_timer_arm, .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, -- cgit v1.2.3 From f2c45807d3992fe0f173f34af9c347d907c31686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:59 +0200 Subject: alarmtimer: Switch over to generic set/get/rearm routine All required callbacks are in place. Switch the alarm timer based posix interval timer callbacks to the common implementation and remove the incorrect private implementation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.825471962@linutronix.de --- kernel/time/alarmtimer.c | 121 +++++++-------------------------------------- kernel/time/posix-timers.c | 12 ++--- kernel/time/posix-timers.h | 6 +++ 3 files changed, 29 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c618a44bb054..d8a7a7e214de 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -515,20 +515,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { - unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); + it.alarm.alarmtimer); enum alarmtimer_restart result = ALARMTIMER_NORESTART; + unsigned long flags; + int si_private = 0; spin_lock_irqsave(&ptr->it_lock, flags); - if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0)) - ptr->it_overrun++; - } - /* Re-add periodic timers */ - if (ptr->it_interval) { - ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval); + ptr->it_active = 0; + if (ptr->it_interval) + si_private = ++ptr->it_requeue_pending; + + if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + /* + * Handle ignored signals and rearm the timer. This will go + * away once we handle ignored signals proper. + */ + ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ++ptr->it_requeue_pending; + ptr->it_active = 1; result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); @@ -658,97 +664,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return 0; } -/** - * alarm_timer_get - posix timer_get interface - * @timr: k_itimer pointer - * @cur_setting: itimerspec data to fill - * - * Copies out the current itimerspec data - */ -static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec64 *cur_setting) -{ - ktime_t relative_expiry_time = - alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - - if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); - } else { - cur_setting->it_value.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - } - - cur_setting->it_interval = ktime_to_timespec64(timr->it_interval); -} - -/** - * alarm_timer_del - posix timer_del interface - * @timr: k_itimer pointer to be deleted - * - * Cancels any programmed alarms for the given timer. - */ -static int alarm_timer_del(struct k_itimer *timr) -{ - if (!rtcdev) - return -ENOTSUPP; - - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - return 0; -} - -/** - * alarm_timer_set - posix timer_set interface - * @timr: k_itimer pointer to be deleted - * @flags: timer flags - * @new_setting: itimerspec to be used - * @old_setting: itimerspec being replaced - * - * Sets the timer to new_setting, and starts the timer. - */ -static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - ktime_t exp; - - if (!rtcdev) - return -ENOTSUPP; - - if (flags & ~TIMER_ABSTIME) - return -EINVAL; - - if (old_setting) - alarm_timer_get(timr, old_setting); - - /* If the timer was already set, cancel it */ - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - /* start the timer */ - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); - - /* - * Rate limit to the tick as a hot fix to prevent DOS. Will be - * mopped up later. - */ - if (timr->it_interval < TICK_NSEC) - timr->it_interval = TICK_NSEC; - - exp = timespec64_to_ktime(new_setting->it_value); - /* Convert (if necessary) to absolute time */ - if (flags != TIMER_ABSTIME) { - ktime_t now; - - now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add_safe(now, exp); - } - - alarm_start(&timr->it.alarm.alarmtimer, exp); - return 0; -} - /** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired @@ -926,9 +841,9 @@ const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, .clock_get = alarm_clock_get, .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, + .timer_set = common_timer_set, + .timer_del = common_timer_del, + .timer_get = common_timer_get, .timer_arm = alarm_timer_arm, .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 8acc9ee2c2d6..6e7a70b1bf37 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -637,8 +637,7 @@ static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) * it is the same as a requeue pending timer WRT to what we should * report. */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; @@ -768,10 +767,9 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) } /* Set a POSIX.1b interval timer. */ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { const struct k_clock *kc = timr->kclock; bool sigev_none; @@ -855,7 +853,7 @@ retry: return error; } -static int common_timer_del(struct k_itimer *timer) +int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index b0ad77e18886..b086f5ba2f5b 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -33,3 +33,9 @@ extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; int posix_timer_event(struct k_itimer *timr, int si_private); + +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); +int common_timer_del(struct k_itimer *timer); -- cgit v1.2.3 From f91840a32deef5cb1bf73338bc5010f843b01426 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Jun 2017 21:03:52 -0700 Subject: perf, bpf: Add BPF support to all perf_event types Allow BPF_PROG_TYPE_PERF_EVENT program types to attach to all perf_event types, including HW_CACHE, RAW, and dynamic pmu events. Only tracepoint/kprobe events are treated differently which require BPF_PROG_TYPE_TRACEPOINT/BPF_PROG_TYPE_KPROBE program types accordingly. Also add support for reading all event counters using bpf_perf_event_read() helper. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 28 +++++++--------------------- kernel/events/core.c | 47 ++++++++++++++++++++++++++++------------------- kernel/trace/bpf_trace.c | 22 ++++++++-------------- 3 files changed, 43 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 172dc8ee0e3b..ecb43542246e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -452,38 +452,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - const struct perf_event_attr *attr; struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; + u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; + ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - ee = ERR_PTR(-EINVAL); - - attr = perf_event_attrs(event); - if (IS_ERR(attr) || attr->inherit) + if (perf_event_read_local(event, &value) == -EOPNOTSUPP) goto err_out; - switch (attr->type) { - case PERF_TYPE_SOFTWARE: - if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) - goto err_out; - /* fall-through */ - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - ee = bpf_event_entry_gen(perf_file, map_file); - if (ee) - return ee; - ee = ERR_PTR(-ENOMEM); - /* fall-through */ - default: - break; - } - + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..51e40e4876c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3636,10 +3636,10 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -u64 perf_event_read_local(struct perf_event *event) +int perf_event_read_local(struct perf_event *event, u64 *value) { unsigned long flags; - u64 val; + int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context @@ -3647,25 +3647,37 @@ u64 perf_event_read_local(struct perf_event *event) */ local_irq_save(flags); - /* If this is a per-task event, it must be for current */ - WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && - event->hw.target != current); - - /* If this is a per-CPU event, it must be for this CPU */ - WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()); - /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ - WARN_ON_ONCE(event->attr.inherit); + if (event->attr.inherit) { + ret = -EOPNOTSUPP; + goto out; + } /* * It must not have a pmu::count method, those are not * NMI safe. */ - WARN_ON_ONCE(event->pmu->count); + if (event->pmu->count) { + ret = -EOPNOTSUPP; + goto out; + } + + /* If this is a per-task event, it must be for current */ + if ((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current) { + ret = -EINVAL; + goto out; + } + + /* If this is a per-CPU event, it must be for this CPU */ + if (!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()) { + ret = -EINVAL; + goto out; + } /* * If the event is currently on this CPU, its either a per-task event, @@ -3675,10 +3687,11 @@ u64 perf_event_read_local(struct perf_event *event) if (event->oncpu == smp_processor_id()) event->pmu->read(event); - val = local64_read(&event->count); + *value = local64_read(&event->count); +out: local_irq_restore(flags); - return val; + return ret; } static int perf_event_read(struct perf_event *event, bool group) @@ -8037,12 +8050,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) bool is_kprobe, is_tracepoint; struct bpf_prog *prog; - if (event->attr.type == PERF_TYPE_HARDWARE || - event->attr.type == PERF_TYPE_SOFTWARE) - return perf_event_set_bpf_handler(event, prog_fd); - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; + return perf_event_set_bpf_handler(event, prog_fd); if (event->tp_event->prog) return -EEXIST; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 460a031c77e5..08eb072430b9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -234,7 +234,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - struct perf_event *event; + u64 value = 0; + int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -247,21 +248,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - event = ee->event; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - - /* make sure event is local and doesn't have pmu::count */ - if (unlikely(event->oncpu != cpu || event->pmu->count)) - return -EINVAL; - + err = perf_event_read_local(ee->event, &value); /* - * we don't know if the function is run successfully by the - * return value. It can be judged in other places, such as - * eBPF programs. + * this api is ugly since we miss [-22..-2] range of valid + * counter values, but that's uapi */ - return perf_event_read_local(event); + if (err) + return err; + return value; } static const struct bpf_func_proto bpf_perf_event_read_proto = { -- cgit v1.2.3 From f99973e18b65ca1fff8c81532e3132b8f622aea8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jun 2017 16:47:09 +0200 Subject: nohz: Fix buggy tick delay on IRQ storms When the tick is stopped and we reach the dynticks evaluation code on IRQ exit, we perform a soft tick restart if we observe an expired timer from there. It means we program the nearest possible tick but we stay in dynticks mode (ts->tick_stopped = 1) because we may need to stop the tick again after that expired timer is handled. Now this solution works most of the time but if we suffer an IRQ storm and those interrupts trigger faster than the hardware clockevents min delay, our tick won't fire until that IRQ storm is finished. Here is the problem: on IRQ exit we reprog the timer to at least NOW() + min_clockevents_delay. Another IRQ fires before the tick so we reschedule again to NOW() + min_clockevents_delay, etc... The tick is eternally rescheduled min_clockevents_delay ahead. A solution is to simply remove this soft tick restart. After all the normal dynticks evaluation path can handle 0 delay just fine. And by doing that we benefit from the optimization branch which avoids clock reprogramming if the clockevents deadline hasn't changed since the last reprog. This fixes our issue because we don't do repetitive clock reprog that always add hardware min delay. As a side effect it should even optimize the 0 delay path in general. Reported-and-tested-by: Octavian Purdila Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496328429-13317-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e3043873fcdc..9d31f1e0067b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -713,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { - tick = 0; - /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): @@ -724,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ - if (!ts->tick_stopped) - goto out; - - /* - * If, OTOH, we did stop it, but there's a pending (expired) - * timer reprogram the timer hardware to fire now. - * - * We will not restart the tick proper, just prod the timer - * hardware into firing an interrupt to process the pending - * timers. Just like tick_irq_exit() will not restart the tick - * for 'normal' interrupts. - * - * Only once we exit the idle loop will we re-enable the tick, - * see tick_nohz_idle_exit(). - */ - if (delta == 0) { - tick_nohz_restart(ts, now); + if (!ts->tick_stopped) { + tick = 0; goto out; } } -- cgit v1.2.3 From 680895d6efe47332d25e49817d2d6781295c1614 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 May 2017 09:51:09 +0200 Subject: sysctl: switch to use uuid_t Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Andy Shevchenko --- kernel/sysctl_binary.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..939a158eab11 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; - uuid_be uuid; + uuid_t uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; result = -EIO; - if (uuid_be_to_bin(buf, &uuid)) + if (uuid_parse(buf, &uuid)) goto out; if (oldlen > 16) -- cgit v1.2.3 From dc4bb0e2356149aee4cdae061936f3bbdd45595c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:46 -0700 Subject: bpf: Introduce bpf_prog ID This patch generates an unique ID for each BPF_PROG_LOAD-ed prog. It is worth to note that each BPF_PROG_LOAD-ed prog will have a different ID even they have the same bpf instructions. The ID is generated by the existing idr_alloc_cyclic(). The ID is ranged from [1, INT_MAX). It is allocated in cyclic manner, so an ID will get reused every 2 billion BPF_PROG_LOAD. The bpf_prog_alloc_id() is done after bpf_prog_select_runtime() because the jit process may have allocated a new prog. Hence, we need to ensure the value of pointer 'prog' will not be changed any more before storing the prog to the prog_idr. After bpf_prog_select_runtime(), the prog is read-only. Hence, the id is stored in 'struct bpf_prog_aux'. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 59da103adb85..2a1b32b470f1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -22,8 +22,11 @@ #include #include #include +#include DEFINE_PER_CPU(int, bpf_prog_active); +static DEFINE_IDR(prog_idr); +static DEFINE_SPINLOCK(prog_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -650,6 +653,34 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } +static int bpf_prog_alloc_id(struct bpf_prog *prog) +{ + int id; + + spin_lock_bh(&prog_idr_lock); + id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + prog->aux->id = id; + spin_unlock_bh(&prog_idr_lock); + + /* id is in [1, INT_MAX) */ + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_prog_free_id(struct bpf_prog *prog) +{ + /* cBPF to eBPF migrations are currently not in the idr store. */ + if (!prog->aux->id) + return; + + spin_lock_bh(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); + spin_unlock_bh(&prog_idr_lock); +} + static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -663,6 +694,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -857,15 +889,21 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; + err = bpf_prog_alloc_id(prog); + if (err) + goto free_used_maps; + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ - goto free_used_maps; + goto free_id; bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; +free_id: + bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: -- cgit v1.2.3 From f3f1c054c288bb6e503005e6d73611151ed20e91 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:47 -0700 Subject: bpf: Introduce bpf_map ID This patch generates an unique ID for each created bpf_map. The approach is similar to the earlier patch for bpf_prog ID. It is worth to note that the bpf_map's ID and bpf_prog's ID are in two independent ID spaces and both have the same valid range: [1, INT_MAX). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a1b32b470f1..4c3075b5d840 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,8 @@ DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); +static DEFINE_IDR(map_idr); +static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -117,6 +119,29 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) free_uid(user); } +static int bpf_map_alloc_id(struct bpf_map *map) +{ + int id; + + spin_lock_bh(&map_idr_lock); + id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + map->id = id; + spin_unlock_bh(&map_idr_lock); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_map_free_id(struct bpf_map *map) +{ + spin_lock_bh(&map_idr_lock); + idr_remove(&map_idr, map->id); + spin_unlock_bh(&map_idr_lock); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -141,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -239,14 +265,20 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; + err = bpf_map_alloc_id(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ - goto free_map; + goto free_id; trace_bpf_map_create(map, err); return err; +free_id: + bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: -- cgit v1.2.3 From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:48 -0700 Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID to allow userspace to iterate all bpf_prog IDs and bpf_map IDs. The API is trying to be consistent with the existing BPF_MAP_GET_NEXT_KEY. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c3075b5d840..2405feedb8c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); @@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr, return ret; } +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:49 -0700 Subject: bpf: Add BPF_PROG_GET_FD_BY_ID Add BPF_PROG_GET_FD_BY_ID command to allow user to get a fd from a bpf_prog's ID. bpf_prog_inc_not_zero() is added and is called with prog_idr_lock held. __bpf_prog_put() is also added which has the 'bool do_idr_lock' param to decide if the prog_idr_lock should be acquired when freeing the prog->id. In the error path of bpf_prog_inc_not_zero(), it may have to call __bpf_prog_put(map, false) which does not need to take the prog_idr_lock when freeing the prog->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 81 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2405feedb8c1..dc6253bb8ebb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; - spin_lock_bh(&prog_idr_lock); + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); - spin_unlock_bh(&prog_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog); + bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); +} EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) @@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. + */ + bpf_prog_put(prog); + return err; + } bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; -free_id: - bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: @@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:50 -0700 Subject: bpf: Add BPF_MAP_GET_FD_BY_ID Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd from a bpf_map's ID. bpf_map_inc_not_zero() is added and is called with map_idr_lock held. __bpf_map_put() is also added which has the 'bool do_idr_lock' param to decide if the map_idr_lock should be acquired when freeing the map->id. In the error path of bpf_map_inc_not_zero(), it may have to call __bpf_map_put(map, false) which does not need to take the map_idr_lock when freeing the map->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc6253bb8ebb..1802bb9c47d9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { - spin_lock_bh(&map_idr_lock); + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + idr_remove(&map_idr, map->id); - spin_unlock_bh(&map_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); } /* called from workqueue */ @@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ - bpf_map_free_id(map); + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr) goto free_map; err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } trace_bpf_map_create(map, err); return err; -free_id: - bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: @@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out the fd is associated with a bpf_prog or bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be a complete list and it is not the goal of this patch. New fields can be added in the future patch. The focus of this patch is to create the interface, BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 163 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 146 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1802bb9c47d9..8942c820d620 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return err; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet. */ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 92046578ac88e0a93f8ef03240e6c832b0189aa7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Jun 2017 18:38:04 +0200 Subject: bpf: cgroup skb progs cannot access ld_abs/ind Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB type to use ld_abs/ind instructions. However, at this point, we cannot use them, since offsets relative to SKF_LL_OFF will end up pointing skb_mac_header(skb) out of bounds since in the egress path it is not yet set at that point in time, but only after __dev_queue_xmit() did a general reset on the mac header. bpf_internal_load_pointer_neg_helper() will then end up reading data from a wrong offset. BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes() already to access packet data, which is also more flexible than the insns carried over from cBPF. Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Chenbo Feng Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acae64df255..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,7 +2426,6 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit v1.2.3 From f3b7eaae1b35eb8077610eb7c7db042c9b0645e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 7 Jun 2017 00:57:37 +0200 Subject: Revert "ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle" Revert commit eed4d47efe95 (ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle) as it turned out to be premature and triggered a number of different issues on various systems. That includes, but is not limited to, premature suspend-to-RAM aborts on Dell XPS 13 (9343) reported by Dominik. The issue the commit in question attempted to address is real and will need to be taken care of going forward, but evidently more work is needed for this purpose. Reported-by: Dominik Brodowski Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- kernel/power/suspend.c | 29 ++++------------------------- 2 files changed, 5 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..c7209f060eeb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(true); + pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c0248c74d6d4..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,8 +72,6 @@ static void freeze_begin(void) static void freeze_enter(void) { - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); - spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -100,27 +98,6 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); - - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); -} - -static void s2idle_loop(void) -{ - do { - freeze_enter(); - - if (freeze_ops && freeze_ops->wake) - freeze_ops->wake(); - - dpm_resume_noirq(PMSG_RESUME); - if (freeze_ops && freeze_ops->sync) - freeze_ops->sync(); - - if (pm_wakeup_pending()) - break; - - pm_wakeup_clear(false); - } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -394,8 +371,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - s2idle_loop(); - goto Platform_early_resume; + trace_suspend_resume(TPS("machine_suspend"), state, true); + freeze_enter(); + trace_suspend_resume(TPS("machine_suspend"), state, false); + goto Platform_wake; } error = disable_nonboot_cpus(); -- cgit v1.2.3 From c3ca46ef719580eb01994fc6032db470fde92d85 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 23 May 2017 15:05:50 +0900 Subject: ftrace/kprobes: selftests: Check kretprobe maxactive is supported Check the kretprobe maxactive is supported by kprobe_events interface. To ensure the kernel feature, this changes ftrace README to describe it. Signed-off-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..dc3f91e70345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4473,7 +4473,8 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p|r[:[/]] []\n" + "\t Format: p[:[/]] []\n" + "\t r[maxactive][:[/]] []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" -- cgit v1.2.3 From cc1582c231ea041fbc68861dfaf957eaf902b829 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Thu, 25 May 2017 18:09:07 +0800 Subject: perf/core: Drop kernel samples even though :u is specified When doing sampling, for example: perf record -e cycles:u ... On workloads that do a lot of kernel entry/exits we see kernel samples, even though :u is specified. This is due to skid existing. This might be a security issue because it can leak kernel addresses even though kernel sampling support is disabled. The patch drops the kernel samples if exclude_kernel is specified. For example, test on Haswell desktop: perf record -e cycles:u perf report --stdio Before patch applied: 99.77% mgen mgen [.] buf_read 0.20% mgen mgen [.] rand_buf_init 0.01% mgen [kernel.vmlinux] [k] apic_timer_interrupt 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen mgen [.] rand_array_init 0.00% mgen [kernel.vmlinux] [k] page_fault 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] __strcasestr 0.00% mgen ld-2.23.so [.] strcmp 0.00% mgen ld-2.23.so [.] _dl_start 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start We can see kernel symbols apic_timer_interrupt and page_fault. After patch applied: 99.79% mgen mgen [.] buf_read 0.19% mgen mgen [.] rand_buf_init 0.00% mgen libc-2.23.so [.] __random_r 0.00% mgen mgen [.] rand_array_init 0.00% mgen mgen [.] last_free_elem 0.00% mgen libc-2.23.so [.] vfprintf 0.00% mgen libc-2.23.so [.] rand 0.00% mgen libc-2.23.so [.] __random 0.00% mgen libc-2.23.so [.] _int_malloc 0.00% mgen libc-2.23.so [.] _IO_doallocbuf 0.00% mgen ld-2.23.so [.] do_lookup_x 0.00% mgen ld-2.23.so [.] open_verify.constprop.7 0.00% mgen ld-2.23.so [.] _dl_important_hwcaps 0.00% mgen libc-2.23.so [.] sched_setaffinity@@GLIBC_2.3.4 0.00% mgen ld-2.23.so [.] _start There are only userspace symbols. Signed-off-by: Jin Yao Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Cc: kan.liang@intel.com Cc: mark.rutland@arm.com Cc: will.deacon@arm.com Cc: yao.jin@intel.com Link: http://lkml.kernel.org/r/1495706947-3744-1-git-send-email-yao.jin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..6c4e523dc1e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7316,6 +7316,21 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + * To avoid leaking information to userspace, we must always + * reject kernel samples when exclude_kernel is set. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -7336,6 +7351,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + /* + * For security, drop the skid kernel samples if necessary. + */ + if (!sample_is_allowed(event, regs)) + return ret; + /* * XXX event_limit might not quite work as expected on inherited * events -- cgit v1.2.3 From ba5213ae6b88fb170c4771fef6553f759c7d8cdd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 May 2017 11:45:12 +0200 Subject: perf/core: Correct event creation with PERF_FORMAT_GROUP Andi was asking about PERF_FORMAT_GROUP vs inherited events, which led to the discovery of a bug from commit: 3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff") - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) is a clear fail :/ While this changes user visible behaviour; it was previously possible to create an inherited event with PERF_SAMPLE_READ; this is deemed acceptible because its results were always incorrect. Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff") Link: http://lkml.kernel.org/r/20170530094512.dy2nljns2uq7qa3j@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3de0b98c4414..407dad6cf89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5729,9 +5729,6 @@ static void perf_output_read_one(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -5776,6 +5773,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) +/* + * XXX PERF_SAMPLE_READ vs inherited events seems difficult. + * + * The problem is that its both hard and excessively expensive to iterate the + * child list, not to mention that its impossible to IPI the children running + * on another CPU, from interrupt/NMI context. + */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { @@ -9462,9 +9466,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited events + * We currently do not support PERF_SAMPLE_READ on inherited events. + * See perf_output_read(). */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) -- cgit v1.2.3 From d0fabd1cb8b70073a0f44f1cf8b663b5e7241c74 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 23 May 2017 14:51:32 -0700 Subject: perf/core: Remove unused perf_cgroup_event_cgrp_time() function The function was added by commit e5d1367f17ba ("perf: Add cgroup support") in 2011 and hasn't been used since then. Removing it fixes the following warning when building with Clang: kernel/events/core.c:696:19: error: unused function 'perf_cgroup_event_cgrp_time' [-Werror,-Wunused-function] Signed-off-by: Matthias Kaehlcke Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Douglas Anderson Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170523215132.189049-1-mka@chromium.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 407dad6cf89a..bc63f8db1b0d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -925,11 +925,6 @@ static inline int is_cgroup_event(struct perf_event *event) return 0; } -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - static inline void update_cgrp_time_from_event(struct perf_event *event) { } -- cgit v1.2.3 From 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 May 2017 12:53:50 +0200 Subject: sched/core: Implement new approach to scale select_idle_cpu() Hackbench recently suffered a bunch of pain, first by commit: 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive") and then by commit: c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()") which fixed a bug in the initial for_each_cpu_wrap() implementation that made select_idle_cpu() even more expensive. The bug was that it would skip over CPUs when bits were consequtive in the bitmask. This however gave me an idea to fix select_idle_cpu(); where the old scheme was a cliff-edge throttle on idle scanning, this introduces a more gradual approach. Instead of stopping to scan entirely, we limit how many CPUs we scan. Initial benchmarks show that it mostly recovers hackbench while not hurting anything else, except Mason's schbench, but not as bad as the old thing. It also appears to recover the tbench high-end, which also suffered like hackbench. Tested-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hpa@zytor.com Cc: kitsunyan Cc: linux-kernel@vger.kernel.org Cc: lvenanci@redhat.com Cc: riel@redhat.com Cc: xiaolong.ye@intel.com Link: http://lkml.kernel.org/r/20170517105350.hk5m4h4jb6dfr65a@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 ++++++++++++++++----- kernel/sched/features.h | 1 + 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a0c552c77b..396bca9c7996 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5794,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index dc4d1483b038..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls -- cgit v1.2.3 From e36d8677bfa55054e4194ec3683189b882a538f6 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:28 +0200 Subject: sched/deadline: Track the active utilization Active utilization is defined as the total utilization of active (TASK_RUNNING) tasks queued on a runqueue. Hence, it is increased when a task wakes up and is decreased when a task blocks. When a task is migrated from CPUi to CPUj, immediately subtract the task's utilization from CPUi and add it to CPUj. This mechanism is implemented by modifying the pull and push functions. Note: this is not fully correct from the theoretical point of view (the utilization should be removed from CPUi only at the 0 lag time), a more theoretically sound solution is presented in the next patches. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Claudio Scordino Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-2-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 6 +++++ 2 files changed, 67 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index df6c2912bd60..b36ecc2b1b10 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static inline +void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw += dl_bw; + SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ +} + +static inline +void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + if (dl_rq->running_bw > old) + dl_rq->running_bw = 0; +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -83,6 +105,8 @@ void init_dl_rq(struct dl_rq *dl_rq) #else init_dl_bw(&dl_rq->dl_bw); #endif + + dl_rq->running_bw = 0; } #ifdef CONFIG_SMP @@ -946,10 +970,14 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_running_bw(dl_se->dl_bw, dl_rq); update_dl_entity(dl_se, pi_se); - else if (flags & ENQUEUE_REPLENISH) + } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } __enqueue_dl_entity(dl_se); } @@ -998,14 +1026,25 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) dl_check_constrained_dl(&p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) + add_running_bw(p->dl.dl_bw, &rq->dl); + /* - * If p is throttled, we do nothing. In fact, if it exhausted + * If p is throttled, we do not enqueue it. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + add_running_bw(p->dl.dl_bw, &rq->dl); return; + } enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1023,6 +1062,20 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); + + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) + sub_running_bw(p->dl.dl_bw, &rq->dl); + + /* + * This check allows to decrease the active utilization in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + sub_running_bw(p->dl.dl_bw, &rq->dl); } /* @@ -1551,7 +1604,9 @@ retry: } deactivate_task(rq, next_task, 0); + sub_running_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1639,7 +1694,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + sub_running_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f8cf1d87f065..ee26867da339 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -558,6 +558,12 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; }; #ifdef CONFIG_SMP -- cgit v1.2.3 From 209a0cbda7a01d2ea32a8b631d35e873bee498e9 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:29 +0200 Subject: sched/deadline: Improve the tracking of active utilization This patch implements a more theoretically sound algorithm for tracking active utilization: instead of decreasing it when a task blocks, use a timer (the "inactive timer", named after the "Inactive" task state of the GRUB algorithm) to decrease the active utilization at the so called "0-lag time". Tested-by: Claudio Scordino Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-3-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 + kernel/sched/deadline.c | 269 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 2 + 3 files changed, 259 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3e50cada84d..968c655ec5d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2153,6 +2153,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; } /* @@ -2184,6 +2185,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2506,6 +2508,7 @@ static int dl_overflow(struct task_struct *p, int policy, !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); + dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { __dl_clear(dl_b, p->dl.dl_bw); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b36ecc2b1b10..6480a929417c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -65,6 +65,161 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw = 0; } +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + if (task_on_rq_queued(p)) + return; + + if (!p->dl.dl_non_contending) + return; + + sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); +} + +/* + * The utilization of a task cannot be immediately removed from + * the rq active utilization (running_bw) when the task blocks. + * Instead, we have to wait for the so called "0-lag time". + * + * If a task blocks before the "0-lag time", a timer (the inactive + * timer) is armed, and running_bw is decreased when the timer + * fires. + * + * If the task wakes up again before the inactive timer fires, + * the timer is cancelled, whereas if the task wakes up after the + * inactive timer fired (and running_bw has been decreased) the + * task's utilization has to be added to running_bw again. + * A flag in the deadline scheduling entity (dl_non_contending) + * is used to avoid race conditions between the inactive timer handler + * and task wakeups. + * + * The following diagram shows how running_bw is updated. A task is + * "ACTIVE" when its utilization contributes to running_bw; an + * "ACTIVE contending" task is in the TASK_RUNNING state, while an + * "ACTIVE non contending" task is a blocked task for which the "0-lag time" + * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" + * time already passed, which does not contribute to running_bw anymore. + * +------------------+ + * wakeup | ACTIVE | + * +------------------>+ contending | + * | add_running_bw | | + * | +----+------+------+ + * | | ^ + * | dequeue | | + * +--------+-------+ | | + * | | t >= 0-lag | | wakeup + * | INACTIVE |<---------------+ | + * | | sub_running_bw | | + * +--------+-------+ | | + * ^ | | + * | t < 0-lag | | + * | | | + * | V | + * | +----+------+------+ + * | sub_running_bw | ACTIVE | + * +-------------------+ | + * inactive timer | non contending | + * fired +------------------+ + * + * The task_non_contending() function is invoked when a task + * blocks, and checks if the 0-lag time already passed or + * not (in the first case, it directly updates running_bw; + * in the second case, it arms the inactive timer). + * + * The task_contending() function is invoked when a task wakes + * up, and checks if the task is still in the "ACTIVE non contending" + * state or not (in the second case, it updates running_bw). + */ +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) + __dl_clear_params(p); + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (dl_se->dl_non_contending) { + dl_se->dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. + */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -617,10 +772,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -839,6 +992,49 @@ throttle: } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -971,9 +1167,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * we want a replenishment of its runtime. */ if (flags & ENQUEUE_WAKEUP) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - - add_running_bw(dl_se->dl_bw, dl_rq); + task_contending(dl_se); update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); @@ -1042,7 +1236,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * add_running_bw(). */ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - add_running_bw(p->dl.dl_bw, &rq->dl); + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl); + return; } @@ -1067,7 +1263,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) sub_running_bw(p->dl.dl_bw, &rq->dl); /* - * This check allows to decrease the active utilization in two cases: + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: * when the task blocks and when it is terminating * (p->state == TASK_DEAD). We can handle the two cases in the same * way, because from GRUB's point of view the same thing is happening @@ -1075,7 +1272,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - sub_running_bw(p->dl.dl_bw, &rq->dl); + task_non_contending(p); } /* @@ -1153,6 +1350,35 @@ out: return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + struct rq *rq; + + if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending)) + return; + + rq = task_rq(p); + /* + * Since p->state == TASK_WAKING, set_task_cpu() has been called + * from try_to_wake_up(). Hence, p->pi_lock is locked, but + * rq->lock is not... So, lock it + */ + raw_spin_lock(&rq->lock); + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + + raw_spin_unlock(&rq->lock); +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1794,13 +2020,23 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1819,6 +2055,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) @@ -1893,6 +2131,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ee26867da339..c58f38905e0a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -1493,6 +1494,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -- cgit v1.2.3 From 387e31300b5760169e6d3f7a9e1eeed12cc5a30b Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:30 +0200 Subject: sched/deadline: Fix the update of the total -deadline utilization Now that the inactive timer can be armed to fire at the 0-lag time, it is possible to use inactive_task_timer() to update the total -deadline utilization (dl_b->total_bw) at the correct time, fixing dl_overflow() and __setparam_dl(). Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-4-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 ++++++++++++++------------------------ kernel/sched/deadline.c | 28 +++++++++++++--------------- 2 files changed, 27 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 968c655ec5d9..126339daebd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2475,9 +2475,6 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. - * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2502,16 +2499,29 @@ static int dl_overflow(struct task_struct *p, int policy, cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ err = 0; } raw_spin_unlock(&dl_b->lock); @@ -4020,26 +4030,6 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - - /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. - * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. - */ } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6480a929417c..add9cba1253c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -175,8 +175,14 @@ static void task_non_contending(struct task_struct *p) if (zerolag_time < 0) { if (dl_task(p)) sub_running_bw(dl_se->dl_bw, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw); __dl_clear_params(p); + raw_spin_unlock(&dl_b->lock); + } return; } @@ -1004,10 +1010,16 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &rf); if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw); + raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); goto unlock; @@ -1534,19 +1546,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void task_dead_dl(struct task_struct *p) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - /* - * Since we are TASK_DEAD we won't slip out of the domain! - */ - raw_spin_lock_irq(&dl_b->lock); - /* XXX we should retain the bw until 0-lag */ - dl_b->total_bw -= p->dl.dl_bw; - raw_spin_unlock_irq(&dl_b->lock); -} - static void set_curr_task_dl(struct rq *rq) { struct task_struct *p = rq->curr; @@ -2141,7 +2140,6 @@ const struct sched_class dl_sched_class = { .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, - .task_dead = task_dead_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, -- cgit v1.2.3 From c52f14d384628db0217a7a9080ab800d5ffb2d72 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:31 +0200 Subject: sched/deadline: Implement GRUB accounting According to the GRUB (Greedy Reclaimation of Unused Bandwidth) reclaiming algorithm, the runtime is not decreased as "dq = -dt", but as "dq = -Uact dt" (where Uact is the per-runqueue active utilization). Hence, this commit modifies the runtime accounting rule in update_curr_dl() to implement the GRUB rule. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-5-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/deadline.c | 17 +++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 126339daebd7..b68a1fa05244 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2423,7 +2423,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; + return BW_UNIT; /* * Doing this here saves a lot of checks in all @@ -2433,7 +2433,7 @@ unsigned long to_ratio(u64 period, u64 runtime) if (period == 0) return 0; - return div64_u64(runtime << 20, period); + return div64_u64(runtime << BW_SHIFT, period); } #ifdef CONFIG_SMP diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index add9cba1253c..0bee537554f6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -917,6 +917,22 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); +/* + * This function implements the GRUB accounting rule: + * according to the GRUB reclaiming algorithm, the runtime is + * not decreased as "dq = -dt", but as "dq = -Uact dt", where + * Uact is the (per-runqueue) active utilization. + * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result + * has to be shifted right by BW_SHIFT. + */ +u64 grub_reclaim(u64 delta, struct rq *rq) +{ + delta *= rq->dl.running_bw; + delta >>= BW_SHIFT; + + return delta; +} + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -959,6 +975,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + delta_exec = grub_reclaim(delta_exec, rq); dl_se->runtime -= delta_exec; throttle: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c58f38905e0a..bb409ef40120 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1496,6 +1496,8 @@ extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.2.3 From 4da3abcefe178c650033f371e94fa10e80bce167 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:32 +0200 Subject: sched/deadline: Do not reclaim the whole CPU bandwidth Original GRUB tends to reclaim 100% of the CPU time... And this allows a CPU hog to starve non-deadline tasks. To address this issue, allow the scheduler to reclaim only a specified fraction of CPU time, stored in the new "bw_ratio" field of the dl runqueue structure. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-6-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ kernel/sched/deadline.c | 12 +++++++++++- kernel/sched/sched.h | 8 ++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b68a1fa05244..7abd06400a98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6759,6 +6759,16 @@ static int sched_dl_global_validate(void) return ret; } +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + } +} + static void sched_dl_do_global(void) { u64 new_bw = -1; @@ -6784,6 +6794,7 @@ static void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0bee537554f6..6a0614b9c8d7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -268,6 +268,7 @@ void init_dl_rq(struct dl_rq *dl_rq) #endif dl_rq->running_bw = 0; + init_dl_rq_bw_ratio(dl_rq); } #ifdef CONFIG_SMP @@ -924,11 +925,20 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * Uact is the (per-runqueue) active utilization. * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result * has to be shifted right by BW_SHIFT. + * To reclaim only a fraction Umax of the CPU time, the + * runtime accounting rule is modified as + * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains + * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted + * right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. */ u64 grub_reclaim(u64 delta, struct rq *rq) { delta *= rq->dl.running_bw; - delta >>= BW_SHIFT; + delta *= rq->dl.bw_ratio; + delta >>= BW_SHIFT + RATIO_SHIFT; return delta; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb409ef40120..878fe757d6ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,6 +565,12 @@ struct dl_rq { * task blocks */ u64 running_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -1495,9 +1501,11 @@ extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.2.3 From 2d4283e9d583a3ee8cfb1cbb9c1270614df4c29d Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:33 +0200 Subject: sched/deadline: Make GRUB a task's flag This patch introduces the SCHED_FLAG_RECLAIM flag to specify that a DL task is allowed to reclaim unused CPU time (using the GRUB algorithm). Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-7-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- kernel/sched/deadline.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7abd06400a98..8d1a5a625814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a0614b9c8d7..61ea3039cdc1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -985,7 +985,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - delta_exec = grub_reclaim(delta_exec, rq); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq); dl_se->runtime -= delta_exec; throttle: -- cgit v1.2.3 From 8fd27231c3302e0c7e1907df1252db97b65eb241 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:34 +0200 Subject: sched/deadline: Track the "total rq utilization" too The total rq utilization is defined as the sum of the utilisations of tasks that are "assigned" to a runqueue, independently from their state (TASK_RUNNING or blocked) Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Claudio Scordino Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-8-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++-------------- kernel/sched/sched.h | 11 +++++ 2 files changed, 95 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 61ea3039cdc1..6c6a1f099d61 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -51,6 +51,7 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -65,25 +66,52 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw = 0; } +static inline +void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw += dl_bw; + SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ +} + +static inline +void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + if (dl_rq->this_bw > old) + dl_rq->this_bw = 0; + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { - if (task_on_rq_queued(p)) - return; + struct rq *rq; - if (!p->dl.dl_non_contending) + if (task_on_rq_queued(p)) return; - sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl); - p->dl.dl_non_contending = 0; - /* - * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() - * will see that dl_not_contending is not set, and - * will not touch the rq's active utilization, - * so we are still safe. - */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + rq = task_rq(p); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(new_bw, &rq->dl); } /* @@ -178,6 +206,8 @@ static void task_non_contending(struct task_struct *p) if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (p->state == TASK_DEAD) + sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_clear(dl_b, p->dl.dl_bw); __dl_clear_params(p); @@ -192,7 +222,7 @@ static void task_non_contending(struct task_struct *p) hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); } -static void task_contending(struct sched_dl_entity *dl_se) +static void task_contending(struct sched_dl_entity *dl_se, int flags) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); @@ -203,6 +233,9 @@ static void task_contending(struct sched_dl_entity *dl_se) if (dl_se->dl_runtime == 0) return; + if (flags & ENQUEUE_MIGRATED) + add_rq_bw(dl_se->dl_bw, dl_rq); + if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; /* @@ -268,6 +301,7 @@ void init_dl_rq(struct dl_rq *dl_rq) #endif dl_rq->running_bw = 0; + dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } @@ -1042,6 +1076,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) if (p->state == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1207,7 +1242,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * we want a replenishment of its runtime. */ if (flags & ENQUEUE_WAKEUP) { - task_contending(dl_se); + task_contending(dl_se, flags); update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); @@ -1260,8 +1295,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) dl_check_constrained_dl(&p->dl); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { + add_rq_bw(p->dl.dl_bw, &rq->dl); add_running_bw(p->dl.dl_bw, &rq->dl); + } /* * If p is throttled, we do not enqueue it. In fact, if it exhausted @@ -1277,7 +1314,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl); + task_contending(&p->dl, flags); return; } @@ -1299,8 +1336,10 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(p->dl.dl_bw, &rq->dl); + } /* * This check allows to start the inactive timer (or to immediately @@ -1394,7 +1433,7 @@ static void migrate_task_rq_dl(struct task_struct *p) { struct rq *rq; - if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending)) + if (p->state != TASK_WAKING) return; rq = task_rq(p); @@ -1404,18 +1443,20 @@ static void migrate_task_rq_dl(struct task_struct *p) * rq->lock is not... So, lock it */ raw_spin_lock(&rq->lock); - sub_running_bw(p->dl.dl_bw, &rq->dl); - p->dl.dl_non_contending = 0; - /* - * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() - * will see that dl_not_contending is not set, and - * will not touch the rq's active utilization, - * so we are still safe. - */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); - + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -1858,7 +1899,9 @@ retry: deactivate_task(rq, next_task, 0); sub_running_bw(next_task->dl.dl_bw, &rq->dl); + sub_rq_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1948,7 +1991,9 @@ static void pull_dl_task(struct rq *this_rq) deactivate_task(src_rq, p, 0); sub_running_bw(p->dl.dl_bw, &src_rq->dl); + sub_rq_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_rq_bw(p->dl.dl_bw, &this_rq->dl); add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2057,6 +2102,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); + if (!task_on_rq_queued(p)) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + /* * We cannot use inactive_task_timer() to invoke sub_running_bw() * at the 0-lag time, because the task could have been migrated @@ -2086,9 +2134,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. */ - if (!task_on_rq_queued(p)) - return; + if (!task_on_rq_queued(p)) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + return; + } /* * If p is boosted we already updated its params in * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 878fe757d6ad..b7321dac03c1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -566,6 +566,17 @@ struct dl_rq { */ u64 running_bw; + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. -- cgit v1.2.3 From 9f0d1a5077399143aad7e1244bb031e29116074e Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:35 +0200 Subject: sched/deadline: Base GRUB reclaiming on the inactive utilization Instead of decreasing the runtime as "dq = -Uact dt" (eventually divided by the maximum utilization available for deadline tasks), decrease it as "dq = -max{u, (1 - Uinact)} dt", where u is the task utilization and Uinact is the "inactive utilization". In this way, the maximum fraction of CPU time that can be reclaimed is given by the total utilization of deadline tasks. This approach solves a fairness issue with "traditional" global GRUB reclaiming: using the traditional GRUB algorithm, if tasks are allocated to the various cores in a non-uniform way, the reclaiming mechanism allows some tasks to reclaim more time than others. This issue is visible starting 11 time-consuming tasks with runtime 10ms and period 30ms (total utilization 3.666) on a 4-cores system: some tasks will receive much more than the reserved runtime (thanks to the reclaiming mechanism), while other tasks will receive less than the reserved runtime. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-9-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6c6a1f099d61..7d2f05778060 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -955,26 +955,30 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as "dq = -Uact dt", where - * Uact is the (per-runqueue) active utilization. - * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result - * has to be shifted right by BW_SHIFT. - * To reclaim only a fraction Umax of the CPU time, the - * runtime accounting rule is modified as - * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains - * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted - * right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt", + * where u is the utilization of the task and Uinact is the + * (per-runqueue) inactive utilization, computed as the difference + * between the "total runqueue utilization" and the runqueue + * active utilization. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations + * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. */ -u64 grub_reclaim(u64 delta, struct rq *rq) +u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - delta *= rq->dl.running_bw; - delta *= rq->dl.bw_ratio; - delta >>= BW_SHIFT + RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ + u64 u_act; - return delta; + /* + * Instead of computing max{u, (1 - u_inact)}, we compare + * u_inact with 1 - u, because u_inact can be larger than 1 + * (so, 1 - u_inact would be negative leading to wrong results) + */ + if (u_inact > BW_UNIT - dl_se->dl_bw) + u_act = dl_se->dl_bw; + else + u_act = BW_UNIT - u_inact; + + return (delta * u_act) >> BW_SHIFT; } /* @@ -1020,7 +1024,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq); + delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; throttle: -- cgit v1.2.3 From daec5798367012951cdb54fdb5c006e4379c9ae9 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:36 +0200 Subject: sched/deadline: Reclaim bandwidth not used by dl tasks This commit introduces a per-runqueue "extra utilization" that can be reclaimed by deadline tasks. In this way, the maximum fraction of CPU time that can reclaimed by deadline tasks is fixed (and configurable) and does not depend on the total deadline utilization. The GRUB accounting rule is modified to add this "extra utilization" to the inactive utilization of the runqueue, and to avoid reclaiming more than a maximum fraction of the CPU time. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-10-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 17 ++++++++++------- kernel/sched/deadline.c | 42 +++++++++++++++++++++++++++--------------- kernel/sched/sched.h | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d1a5a625814..799647927c4c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2444,7 +2444,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->rd->dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; @@ -2462,7 +2462,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->dl.dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { return 1; } @@ -2500,8 +2500,8 @@ static int dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { @@ -2512,8 +2512,8 @@ static int dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. */ - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { @@ -5515,7 +5515,7 @@ int task_can_attach(struct task_struct *p, * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ - __dl_add(dl_b, p->dl.dl_bw); + __dl_add(dl_b, p->dl.dl_bw, cpus); } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); @@ -6764,9 +6764,12 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7d2f05778060..e3b25dfb74f3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -209,7 +209,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -955,28 +955,40 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt", - * where u is the utilization of the task and Uinact is the - * (per-runqueue) inactive utilization, computed as the difference - * between the "total runqueue utilization" and the runqueue - * active utilization. + * not decreased as "dq = -dt", but as + * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * where u is the utilization of the task, Umax is the maximum reclaimable + * utilization, Uinact is the (per-runqueue) inactive utilization, computed + * as the difference between the "total runqueue utilization" and the + * runqueue active utilization, and Uextra is the (per runqueue) extra + * reclaimable utilization. * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * multiplied by 2^BW_SHIFT, the result has to be shifted right by + * BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. */ u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; + u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; /* - * Instead of computing max{u, (1 - u_inact)}, we compare - * u_inact with 1 - u, because u_inact can be larger than 1 - * (so, 1 - u_inact would be negative leading to wrong results) + * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, + * we compare u_inact + rq->dl.extra_bw with + * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because + * u_inact + rq->dl.extra_bw can be larger than + * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative + * leading to wrong results) */ - if (u_inact > BW_UNIT - dl_se->dl_bw) - u_act = dl_se->dl_bw; + if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) + u_act = u_act_min; else - u_act = BW_UNIT - u_inact; + u_act = BW_UNIT - u_inact - rq->dl.extra_bw; return (delta * u_act) >> BW_SHIFT; } @@ -1085,7 +1097,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2054,7 +2066,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw); + __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7321dac03c1..f1e400c6403c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void) } extern struct dl_bw *dl_bw_of(int i); +extern int dl_bw_cpus(int i); struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -576,6 +581,7 @@ struct dl_rq { * runqueue (inactive utilization = this_bw - running_bw). */ u64 this_bw; + u64 extra_bw; /* * Inverse of the fraction of CPU utilization that can be reclaimed @@ -1958,6 +1964,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; -- cgit v1.2.3 From ae83b56a56f8d9643dedbee86b457fa1c5d42f59 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 10 May 2017 21:03:37 +0800 Subject: sched/deadline: Zero out positive runtime after throttling constrained tasks When a contrained task is throttled by dl_check_constrained_dl(), it may carry the remaining positive runtime, as a result when dl_task_timer() fires and calls replenish_dl_entity(), it will not be replenished correctly due to the positive dl_se->runtime. This patch assigns its runtime to 0 if positive after throttling. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline) Link: http://lkml.kernel.org/r/1494421417-27550-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e3b25dfb74f3..54302cf68bb9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -941,6 +941,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } -- cgit v1.2.3 From 3effcb4247e74a51f5d8b775a1ee4abf87cc089a Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 29 May 2017 16:24:03 +0200 Subject: sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks We have been facing some problems with self-suspending constrained deadline tasks. The main reason is that the original CBS was not designed for such sort of tasks. One problem reported by Xunlei Pang takes place when a task suspends, and then is awakened before the deadline, but so close to the deadline that its remaining runtime can cause the task to have an absolute density higher than allowed. In such situation, the original CBS assumes that the task is facing an early activation, and so it replenishes the task and set another deadline, one deadline in the future. This rule works fine for implicit deadline tasks. Moreover, it allows the system to adapt the period of a task in which the external event source suffered from a clock drift. However, this opens the window for bandwidth leakage for constrained deadline tasks. For instance, a task with the following parameters: runtime = 5 ms deadline = 7 ms [density] = 5 / 7 = 0.71 period = 1000 ms If the task runs for 1 ms, and then suspends for another 1ms, it will be awakened with the following parameters: remaining runtime = 4 laxity = 5 presenting a absolute density of 4 / 5 = 0.80. In this case, the original CBS would assume the task had an early wakeup. Then, CBS will reset the runtime, and the absolute deadline will be postponed by one relative deadline, allowing the task to run. The problem is that, if the task runs this pattern forever, it will keep receiving bandwidth, being able to run 1ms every 2ms. Following this behavior, the task would be able to run 500 ms in 1 sec. Thus running more than the 5 ms / 1 sec the admission control allowed it to run. Trying to address the self-suspending case, Luca Abeni, Giuseppe Lipari, and Juri Lelli [1] revisited the CBS in order to deal with self-suspending tasks. In the new approach, rather than replenishing/postponing the absolute deadline, the revised wakeup rule adjusts the remaining runtime, reducing it to fit into the allowed density. A revised version of the idea is: At a given time t, the maximum absolute density of a task cannot be higher than its relative density, that is: runtime / (deadline - t) <= dl_runtime / dl_deadline Knowing the laxity of a task (deadline - t), it is possible to move it to the other side of the equality, thus enabling to define max remaining runtime a task can use within the absolute deadline, without over-running the allowed density: runtime = (dl_runtime / dl_deadline) * (deadline - t) For instance, in our previous example, the task could still run: runtime = ( 5 / 7 ) * 5 runtime = 3.57 ms Without causing damage for other deadline tasks. It is note worthy that the laxity cannot be negative because that would cause a negative runtime. Thus, this patch depends on the patch: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline") Which throttles a constrained deadline task activated after the deadline. Finally, it is also possible to use the revised wakeup rule for all other tasks, but that would require some more discussions about pros and cons. Reported-by: Xunlei Pang Signed-off-by: Daniel Bristot de Oliveira [peterz: replaced dl_is_constrained with dl_is_implicit] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.1495803804.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 + kernel/sched/deadline.c | 98 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 799647927c4c..e5bd587e87f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2150,6 +2150,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; @@ -4030,6 +4031,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 54302cf68bb9..e12f85975857 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -704,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -720,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -1274,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -1310,7 +1384,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { -- cgit v1.2.3 From f5832c1998af2ca8d9947792d1c8e1816ab58e57 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 29 May 2017 17:02:57 -0400 Subject: sched/core: Omit building stop_sched_class when !SMP The stop class is invoked through stop_machine only. This is dead code on UP builds. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170529210302.26868-3-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 4 ++-- kernel/sched/core.c | 60 +++++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 4 ++++ 3 files changed, 36 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..5e4c2e7a632b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5bd587e87f8..c343b8135774 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -788,36 +788,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - /* * __normal_prio - return the priority that is based on the static prio */ @@ -1588,6 +1558,36 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1e400c6403c..f2ef759a4cb6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1453,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -- cgit v1.2.3 From ebfa4c02fa4806bfef189e88152b833f2a732bff Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 7 Jun 2017 10:49:02 +0800 Subject: sched/idle: Add deferrable vmstat_updater back Deferrable vmstat_updater was missing in commit: c1de45ca831a ("sched/idle: Add support for tasks that inject idle") Add it back. Signed-off-by: Aubrey Li Signed-off-by: Peter Zijlstra (Intel) Cc: Aubrey Li Cc: Christoph Lameter Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496803742-38274-1-git-send-email-aubrey.li@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ef63adce0c9c..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void do_idle(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { -- cgit v1.2.3 From f5694788ad8da5da41b501f3d6d2ae22379c4ef9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Sep 2016 12:15:37 +0200 Subject: rt_mutex: Add lockdep annotations Now that (PI) futexes have their own private RT-mutex interface and implementation we can easily add lockdep annotations to the existing RT-mutex interface. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex-debug.c | 6 +++++- kernel/locking/rtmutex-debug.h | 2 +- kernel/locking/rtmutex.c | 36 +++++++++++++++++++++++++++++------- kernel/locking/rtmutex.h | 2 +- 4 files changed, 36 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 58e366ad36f4..ac35e648b0e5 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lock->name = name; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif } diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index b585af9a1b50..5078c6ddf4a5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -11,7 +11,7 @@ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 28cd09e635ed..43123533e9b1 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { + int ret; + might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { + int ret; + might_sleep(); - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + int ret; + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1561,6 +1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { + mutex_release(&lock->dep_map, 1, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); @@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock) lock->magic = NULL; #endif } - EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** @@ -1632,14 +1653,15 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); * * Initializing of a locked rt lock is not allowed */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) +void __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name); + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -1660,7 +1682,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + __rt_mutex_init(lock, NULL, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 6607802efa8b..5c253caffe91 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -17,7 +17,7 @@ #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_init(m, n, k) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_reset_waiter(w) do { } while (0) -- cgit v1.2.3 From dac8bbbae1d0ccba96402d25deeed3a2e87992c6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 8 Jun 2017 12:01:30 +0200 Subject: Revert "printk: fix double printing with earlycon" This reverts commit cf39bf58afdaabc0b86f141630fb3fd18190294e. The commit regression to users that define both console=ttyS1 and console=ttyS0 on the command line, see https://lkml.kernel.org/r/20170509082915.GA13236@bistromath.localdomain The kernel log messages always appeared only on one serial port. It is even documented in Documentation/admin-guide/serial-console.rst: "Note that you can only define one console per device type (serial, video)." The above mentioned commit changed the order in which the command line parameters are searched. As a result, the kernel log messages go to the last mentioned ttyS* instead of the first one. We long thought that using two console=ttyS* on the command line did not make sense. But then we realized that console= parameters were handled also by systemd, see http://0pointer.de/blog/projects/serial-console.html "By default systemd will instantiate one serial-getty@.service on the main kernel console, if it is not a virtual terminal." where "[4] If multiple kernel consoles are used simultaneously, the main console is the one listed first in /sys/class/tty/console/active, which is the last one listed on the kernel command line." This puts the original report into another light. The system is running in qemu. The first serial port is used to store the messages into a file. The second one is used to login to the system via a socket. It depends on systemd and the historic kernel behavior. By other words, systemd causes that it makes sense to define both console=ttyS1 console=ttyS0 on the command line. The kernel fix caused regression related to userspace (systemd) and need to be reverted. In addition, it went out that the fix helped only partially. The messages still were duplicated when the boot console was removed early by late_initcall(printk_late_init). Then the entire log was replayed when the same console was registered as a normal one. Link: 20170606160339.GC7604@pathway.suse.cz Cc: Aleksey Makarov Cc: Sabrina Dubroca Cc: Sudeep Holla Cc: Greg Kroah-Hartman Cc: Peter Hurley Cc: Jiri Slaby Cc: Robin Murphy , Cc: Steven Rostedt Cc: "Nair, Jayachandran" Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reported-by: Sabrina Dubroca Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 46 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..9dbceb76e6bc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -269,7 +269,6 @@ static struct console *exclusive_console; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; -static int console_cmdline_cnt; static int preferred_console = -1; int console_set_on_cmdline; @@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. */ - for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { - if (brl_options) - return 0; - - /* - * Maintain an invariant that will help to find if - * the matching console is preferred, see - * register_console(): - * - * The last non-braille console is always - * the preferred one. - */ - if (i != console_cmdline_cnt - 1) - swap(console_cmdline[i], - console_cmdline[console_cmdline_cnt - 1]); - - preferred_console = console_cmdline_cnt - 1; - + if (!brl_options) + preferred_console = i; return 0; } } @@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options, braille_set_options(c, brl_options); c->index = idx; - console_cmdline_cnt++; return 0; } /* @@ -2477,23 +2462,12 @@ void register_console(struct console *newcon) } /* - * See if this console matches one we selected on the command line. - * - * There may be several entries in the console_cmdline array matching - * with the same console, one with newcon->match(), another by - * name/index: - * - * pl011,mmio,0x87e024000000,115200 -- added from SPCR - * ttyAMA0 -- added from command line - * - * Traverse the console_cmdline array in reverse order to be - * sure that if this console is preferred then it will be the first - * matching entry. We use the invariant that is maintained in - * __add_preferred_console(). + * See if this console matches one we selected on + * the command line. */ - for (i = console_cmdline_cnt - 1; i >= 0; i--) { - c = console_cmdline + i; - + for (i = 0, c = console_cmdline; + i < MAX_CMDLINECONSOLES && c->name[0]; + i++, c++) { if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ -- cgit v1.2.3 From cdf7abc4610a7f1c43d06cda246c5f748a4fd267 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:03:10 +0200 Subject: srcu: Allow use of Tiny/Tree SRCU from both process and interrupt context Linu Cherian reported a WARN in cleanup_srcu_struct() when shutting down a guest running iperf on a VFIO assigned device. This happens because irqfd_wakeup() calls srcu_read_lock(&kvm->irq_srcu) in interrupt context, while a worker thread does the same inside kvm_set_irq(). If the interrupt happens while the worker thread is executing __srcu_read_lock(), updates to the Classic SRCU ->lock_count[] field or the Tree SRCU ->srcu_lock_count[] field can be lost. The docs say you are not supposed to call srcu_read_lock() and srcu_read_unlock() from irq context, but KVM interrupt injection happens from (host) interrupt context and it would be nice if SRCU supported the use case. KVM is using SRCU here not really for the "sleepable" part, but rather due to its IPI-free fast detection of grace periods. It is therefore not desirable to switch back to RCU, which would effectively revert commit 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING", 2014-01-16). However, the docs are overly conservative. You can have an SRCU instance only has users in irq context, and you can mix process and irq context as long as process context users disable interrupts. In addition, __srcu_read_unlock() actually uses this_cpu_dec() on both Tree SRCU and Classic SRCU. For those two implementations, only srcu_read_lock() is unsafe. When Classic SRCU's __srcu_read_unlock() was changed to use this_cpu_dec(), in commit 5a41344a3d83 ("srcu: Simplify __srcu_read_unlock() via this_cpu_dec()", 2012-11-29), __srcu_read_lock() did two increments. Therefore it kept __this_cpu_inc(), with preempt_disable/enable in the caller. Tree SRCU however only does one increment, so on most architectures it is more efficient for __srcu_read_lock() to use this_cpu_inc(), and any performance differences appear to be down in the noise. Unlike Classic and Tree SRCU, Tiny SRCU does increments and decrements on a single variable. Therefore, as Peter Zijlstra pointed out, Tiny SRCU's implementation already supports mixed-context use of srcu_read_lock() and srcu_read_unlock(), at least as long as uses of srcu_read_lock() and srcu_read_unlock() in each handler are nested and paired properly. In other words, it is still illegal to (say) invoke srcu_read_lock() in an interrupt handler and to invoke the matching srcu_read_unlock() in a softirq handler. Therefore, the only change required for Tiny SRCU is to its comments. Fixes: 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING") Reported-by: Linu Cherian Suggested-by: Linu Cherian Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini Cc: Linus Torvalds Signed-off-by: Paul E. McKenney Tested-by: Paolo Bonzini --- kernel/rcu/srcutiny.c | 7 ++++--- kernel/rcu/srcutree.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 36e1f82faed1..32798eb14853 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -97,8 +97,9 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * srcu_struct. Can be invoked from irq/bh handlers, but the matching + * __srcu_read_unlock() must be in the same handler instance. Returns an + * index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) { @@ -112,7 +113,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); /* * Removes the count for the old reader from the appropriate element of - * the srcu_struct. Must be called from process context. + * the srcu_struct. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3ae8474557df..157654fa436a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -357,7 +357,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. + * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) @@ -365,7 +365,7 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->srcu_idx) & 0x1; - __this_cpu_inc(sp->sda->srcu_lock_count[idx]); + this_cpu_inc(sp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -375,7 +375,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { -- cgit v1.2.3 From 1123a6041654e8f889014659593bad4168e542c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 May 2017 14:03:11 +0200 Subject: srcu: Allow use of Classic SRCU from both process and interrupt context Linu Cherian reported a WARN in cleanup_srcu_struct() when shutting down a guest running iperf on a VFIO assigned device. This happens because irqfd_wakeup() calls srcu_read_lock(&kvm->irq_srcu) in interrupt context, while a worker thread does the same inside kvm_set_irq(). If the interrupt happens while the worker thread is executing __srcu_read_lock(), updates to the Classic SRCU ->lock_count[] field or the Tree SRCU ->srcu_lock_count[] field can be lost. The docs say you are not supposed to call srcu_read_lock() and srcu_read_unlock() from irq context, but KVM interrupt injection happens from (host) interrupt context and it would be nice if SRCU supported the use case. KVM is using SRCU here not really for the "sleepable" part, but rather due to its IPI-free fast detection of grace periods. It is therefore not desirable to switch back to RCU, which would effectively revert commit 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING", 2014-01-16). However, the docs are overly conservative. You can have an SRCU instance only has users in irq context, and you can mix process and irq context as long as process context users disable interrupts. In addition, __srcu_read_unlock() actually uses this_cpu_dec() on both Tree SRCU and Classic SRCU. For those two implementations, only srcu_read_lock() is unsafe. When Classic SRCU's __srcu_read_unlock() was changed to use this_cpu_dec(), in commit 5a41344a3d83 ("srcu: Simplify __srcu_read_unlock() via this_cpu_dec()", 2012-11-29), __srcu_read_lock() did two increments. Therefore it kept __this_cpu_inc(), with preempt_disable/enable in the caller. Tree SRCU however only does one increment, so on most architectures it is more efficient for __srcu_read_lock() to use this_cpu_inc(), and any performance differences appear to be down in the noise. Cc: stable@vger.kernel.org Fixes: 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING") Reported-by: Linu Cherian Suggested-by: Linu Cherian Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini Cc: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 584d8a983883..dea03614263f 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -263,7 +263,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. + * srcu_struct. * Returns an index that must be passed to the matching srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *sp) @@ -271,7 +271,7 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -281,7 +281,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. */ void __srcu_read_unlock(struct srcu_struct *sp, int idx) { -- cgit v1.2.3 From f92c734f02cbf10e40569facff82059ae9b61920 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Apr 2017 15:40:35 -0700 Subject: rcu: Prevent rcu_barrier() from starting needless grace periods Currently rcu_barrier() uses call_rcu() to enqueue new callbacks on each CPU with a non-empty callback list. This works, but means that rcu_barrier() forces grace periods that are not otherwise needed. The key point is that rcu_barrier() never needs to wait for a grace period, but instead only for all pre-existing callbacks to be invoked. This means that rcu_barrier()'s new callbacks should be placed in the callback-list segment containing the last pre-existing callback. This commit makes this change using the new rcu_segcblist_entrain() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e354e475e645..657056c3e0cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3578,8 +3578,14 @@ static void rcu_barrier_func(void *type) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); + rdp->barrier_head.func = rcu_barrier_callback; + debug_rcu_head_queue(&rdp->barrier_head); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + atomic_inc(&rsp->barrier_cpu_count); + } else { + debug_rcu_head_unqueue(&rdp->barrier_head); + _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + } } /* -- cgit v1.2.3 From 881ec9d209d5371c21db89ca1bb19afd3fcadab3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Apr 2017 15:16:50 -0700 Subject: srcu: Eliminate possibility of destructive counter overflow Earlier versions of Tree SRCU were subject to a counter overflow bug that could theoretically result in too-short grace periods. This commit eliminates this problem by adding an update-side memory barrier. The short explanation is that if the updater sums the unlock counts too late to see a given __srcu_read_unlock() increment, that CPU's next __srcu_read_lock() must see the new value of ->srcu_idx, thus incrementing the other bank of counters. This eliminates the possibility of destructive counter overflow as long as the srcu_read_lock() nesting level does not exceed floor(ULONG_MAX/NR_CPUS/2), which should be an eminently reasonable nesting limit, especially on 64-bit systems. Reported-by: Lance Roy Suggested-by: Lance Roy Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 157654fa436a..fceca84df6b0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -275,15 +275,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * not mean that there are no more readers, as one could have read * the current index but not have incremented the lock counter yet. * - * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were - * counted, meaning that this could return true even if there are - * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->srcu_idx - * before running srcu_readers_unlock_idx(), which means that there - * could be an arbitrarily large number of critical sections that - * execute after srcu_readers_unlock_idx() but use the old value - * of ->srcu_idx. + * So suppose that the updater is preempted here for so long + * that more than ULONG_MAX non-nested readers come and go in + * the meantime. It turns out that this cannot result in overflow + * because if a reader modifies its unlock count after we read it + * above, then that reader's next load of ->srcu_idx is guaranteed + * to get the new value, which will cause it to operate on the + * other bank of counters, where it cannot contribute to the + * overflow of these counters. This means that there is a maximum + * of 2*NR_CPUS increments, which cannot overflow given current + * systems, especially not on 64-bit systems. + * + * OK, how about nesting? This does impose a limit on nesting + * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, + * especially on 64-bit systems. */ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -671,6 +676,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { + /* + * Ensure that if this updater saw a given reader's increment + * from __srcu_read_lock(), that reader was using an old value + * of ->srcu_idx. Also ensure that if a given reader sees the + * new value of ->srcu_idx, this updater's earlier scans cannot + * have seen that reader's increments (which is OK, because this + * grace period need not wait on that reader). + */ + smp_mb(); /* E */ /* Pairs with B and C. */ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* -- cgit v1.2.3 From 5b72f9643b52a5148bb8ced126e20563adfa3466 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Apr 2017 15:29:14 -0700 Subject: rcu: Complain if blocking in preemptible RCU read-side critical section Although preemptible RCU allows its read-side critical sections to be preempted, general blocking is forbidden. The reason for this is that excessive preemption times can be handled by CONFIG_RCU_BOOST=y, but a voluntarily blocked task doesn't care how high you boost its priority. Because preemptible RCU is a global mechanism, one ill-behaved reader hurts everyone. Hence the prohibition against general blocking in RCU-preempt read-side critical sections. Preemption yes, blocking no. This commit enforces this prohibition. There is a special exception for the -rt patchset (which they kindly volunteered to implement): It is OK to block (as opposed to merely being preempted) within an RCU-preempt read-side critical section, but only if the blocking is subject to priority inheritance. This exception permits CONFIG_RCU_BOOST=y to get -rt RCU readers out of trouble. Why doesn't this exception also apply to mainline's rt_mutex? Because of the possibility that someone does general blocking while holding an rt_mutex. Yes, the priority boosting will affect the rt_mutex, but it won't help with the task doing general blocking while holding that rt_mutex. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 657056c3e0cd..9ce682242e99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -478,7 +478,7 @@ void rcu_note_context_switch(bool preempt) barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(); + rcu_preempt_note_context_switch(preempt); /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) goto out; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ba38262c3554..0fa7aee9ef55 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -477,7 +477,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(void); +static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..a421753e8e9c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -286,12 +286,13 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -738,7 +739,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } -- cgit v1.2.3 From 9683937df9ebf4eb62cdedb09c5f20a0760c7d80 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Apr 2017 16:12:52 -0700 Subject: rcuperf: Defer expedited/normal check to end of test Current rcuperf startup checks to see if the user asked to measure only expedited grace periods, yet constrained all grace periods to be normal, or if the user asked to measure only normal grace periods, yet constrained all grace periods to be expedited. Useless tests of this sort are aborted. Unfortunately, making RCU work through the mid-boot dead zone [1] puts RCU into expedited-only mode during that zone. Which happens to also be the exact time that rcuperf carries out the aforementioned check. So if the user asks rcuperf to measure only normal grace periods (the default), rcuperf will now always complain and terminate the test. This commit therefore moves the checks to rcu_perf_cleanup(). This has the disadvantage of failing to abort useless tests, but avoids the need to create yet another kthread and the need to do fiddly checks involving the holdoff time. (Yes, another approach is to do the checks in a late-stage init function, but that would require some way to communicate badness to rcuperf's kthreads, and seems not worth the bother.) [1] https://lwn.net/Articles/716148/ Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..ef5b1faac495 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -452,6 +452,15 @@ rcu_perf_cleanup(void) u64 *wdp; u64 *wdpp; + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (torture_cleanup_begin()) return; @@ -624,16 +633,6 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } - if (rcu_gp_is_normal() && gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), -- cgit v1.2.3 From e28371c891db29c892d85322ea27ad997cc50f72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Apr 2017 09:59:53 -0700 Subject: rcu: Remove obsolete reference to synchronize_kernel() The synchronize_kernel() primitive was removed in favor of synchronize_sched() more than a decade ago, and it seems likely that rather few kernel hackers are familiar with it. Its continued presence is therefore providing more confusion than enlightenment. This commit therefore removes the reference from the synchronize_sched() header comment, and adds the corresponding information to the synchronize_rcu(0 header comment. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ------ kernel/rcu/tree_plugin.h | 9 +++++++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9ce682242e99..3bee58fc23b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3280,12 +3280,6 @@ static inline int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. */ void synchronize_sched(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a421753e8e9c..3b432fa4c45b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -664,8 +664,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { -- cgit v1.2.3 From 881ed593a323c832c2e9383effeb6a0c99859210 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Apr 2017 12:47:10 -0700 Subject: rcuperf: Add ability to performance-test call_rcu() and friends This commit upgrades rcuperf so that it can do performance testing on asynchronous grace-period primitives such as call_srcu(). There is a new rcuperf.gp_async module parameter that specifies this new behavior, with the pre-existing rcuperf.gp_exp testing expedited grace periods such as synchronize_rcu_expedited, and with the default being to test synchronous non-expedited grace periods such as synchronize_rcu(). There is also a new rcuperf.gp_async_max module parameter that specifies the maximum number of outstanding callbacks per writer kthread, defaulting to 1,000. When this limit is exceeded, the writer thread invokes the appropriate flavor of rcu_barrier() to wait for callbacks to drain. Signed-off-by: Paul E. McKenney [ paulmck: Removed the redundant initialization noted by Arnd Bergmann. ] --- kernel/rcu/rcuperf.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index ef5b1faac495..e1ce97bead94 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -59,6 +59,8 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); @@ -86,13 +88,16 @@ static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); static int rcu_perf_writer_state; #define RTWS_INIT 0 -#define RTWS_EXP_SYNC 1 -#define RTWS_SYNC 2 -#define RTWS_IDLE 2 -#define RTWS_STOPPING 3 +#define RTWS_ASYNC 1 +#define RTWS_BARRIER 2 +#define RTWS_EXP_SYNC 3 +#define RTWS_SYNC 4 +#define RTWS_IDLE 5 +#define RTWS_STOPPING 6 #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -114,6 +119,8 @@ struct rcu_perf_ops { unsigned long (*started)(void); unsigned long (*completed)(void); unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); const char *name; @@ -153,6 +160,8 @@ static struct rcu_perf_ops rcu_ops = { .started = rcu_batches_started, .completed = rcu_batches_completed, .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .name = "rcu" @@ -181,6 +190,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_bh, + .gp_barrier = rcu_barrier_bh, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, .name = "rcu_bh" @@ -208,6 +219,16 @@ static unsigned long srcu_perf_completed(void) return srcu_batches_completed(srcu_ctlp); } +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + static void srcu_perf_synchronize(void) { synchronize_srcu(srcu_ctlp); @@ -226,6 +247,8 @@ static struct rcu_perf_ops srcu_ops = { .started = NULL, .completed = srcu_perf_completed, .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, .sync = srcu_perf_synchronize, .exp_sync = srcu_perf_synchronize_expedited, .name = "srcu" @@ -254,6 +277,8 @@ static struct rcu_perf_ops sched_ops = { .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_sched, + .gp_barrier = rcu_barrier_sched, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, .name = "sched" @@ -281,6 +306,8 @@ static struct rcu_perf_ops tasks_ops = { .readunlock = tasks_perf_read_unlock, .started = rcu_no_completed, .completed = rcu_no_completed, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .name = "tasks" @@ -343,6 +370,15 @@ rcu_perf_reader(void *arg) return 0; } +/* + * Callback function for asynchronous grace periods from rcu_perf_writer(). + */ +static void rcu_perf_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + /* * RCU perf writer kthread. Repeatedly does a grace period. */ @@ -352,6 +388,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct rcu_head *rhp = NULL; struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; @@ -382,7 +419,23 @@ rcu_perf_writer(void *arg) do { wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_exp) { + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + rcu_perf_writer_state = RTWS_ASYNC; + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_perf_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { @@ -429,6 +482,10 @@ rcu_perf_writer(void *arg) i++; rcu_perf_wait_shutdown(); } while (!torture_must_stop()); + if (gp_async) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + } rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); @@ -460,6 +517,8 @@ rcu_perf_cleanup(void) VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; -- cgit v1.2.3 From dcfc315b7b7a5e7668dd9cb2474708b51ab1cdb1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 09:53:07 -0700 Subject: rcu: Make sync_rcu_preempt_exp_done() return bool The sync_rcu_preempt_exp_done() function returns a logical expression, but its return type is nevertheless int. This commit therefore changes the return type to bool. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e513b4ab1197..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * * Caller must hold the rcu_state's exp_mutex. */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; -- cgit v1.2.3 From f60cb4d4c8e568c2d63d01d72e589c4f75ee4140 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Apr 2017 13:43:21 -0700 Subject: rcuperf: Add test for dynamically initialized srcu_struct This commit adds a perf_type of "srcud", which species that rcuperf test SRCU on a dynamically initialized srcu_struct. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e1ce97bead94..5158ddba6716 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -254,6 +254,35 @@ static struct rcu_perf_ops srcu_ops = { .name = "srcu" }; +static struct srcu_struct srcud; + +static void srcu_sync_perf_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_perf_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_perf_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_perf_init, + .cleanup = srcu_sync_perf_cleanup, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcud" +}; + /* * Definitions for sched perf testing. */ @@ -622,7 +651,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, RCUPERF_TASKS_OPS }; -- cgit v1.2.3 From 1f4f6da1c80905830337c3ff46a2d3260dabb864 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 11:16:32 -0700 Subject: srcu: Make Classic and Tree SRCU announce themselves at bootup Currently, the only way to tell whether a given kernel is running Classic, Tiny, or Tree SRCU is to look at the .config file, which can easily be lost or associated with the wrong kernel. This commit therefore has Classic and Tree SRCU identify themselves at boot time. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 7 +++++++ kernel/rcu/srcutree.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index dea03614263f..4e3f558409a0 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -659,3 +659,10 @@ void process_srcu(struct work_struct *work) srcu_reschedule(sp); } EXPORT_SYMBOL_GPL(process_srcu); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Classic SRCU implementation.\n"); + return 0; +} +early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fceca84df6b0..03d57fe9f094 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1167,3 +1167,10 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Hierarchical SRCU implementation.\n"); + return 0; +} +early_initcall(srcu_bootup_announce); -- cgit v1.2.3 From 3ddf20c953520203c42dbed1f091ed52080e1cd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 13:33:20 -0700 Subject: srcu: Shrink Tiny SRCU a bit more This commit rearranges Tiny SRCU's srcu_struct structure, substitutes u8 for bool, and shrinks counters down to short. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ae6e574d4cf5..a58592b73f19 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -609,7 +609,7 @@ static void srcu_torture_stats(void) pr_cont("\n"); #elif defined(CONFIG_TINY_SRCU) idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", torture_type, TORTURE_FLAG, idx, READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -- cgit v1.2.3 From 492b95e59735998312f678d77a2d5fe20af6b0b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 16:09:15 -0700 Subject: rcuperf: Set more user-friendly defaults Common-case use of rcuperf must set rcuperf.nreaders=0 and if not built as a module, rcuperf.shutdown. This commit therefore sets the default for rcuperf.nreaders to zero and sets the default for rcuperf.shutdown to zero if rcuperf is built as a module and to one otherwise. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5158ddba6716..49c8ed6bd2fd 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -63,9 +63,10 @@ torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nreaders, 0, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, shutdown, !IS_ENABLED(MODULE), + "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); static char *perf_type = "rcu"; -- cgit v1.2.3 From 820687a7b98a5031207893ff265f97c0a0ad403e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 15:12:56 -0700 Subject: rcuperf: Add writer_holdoff boot parameter This commit adds a writer_holdoff boot parameter to rcuperf, which is intended to be used to test Tree SRCU's auto-expediting. This boot parameter is in microseconds, and defaults to zero (that is, disabled). Set it to a bit larger than srcutree.exp_holdoff, keeping the nanosecond/microsecond conversion, to force Tree SRCU to auto-expedite more aggressively. This commit also adds documentation for this parameter, and fixes some alphabetization while in the neighborhood. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 49c8ed6bd2fd..d80f11d9f8bd 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -68,6 +68,7 @@ torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -447,6 +448,8 @@ rcu_perf_writer(void *arg) } do { + if (writer_holdoff) + udelay(writer_holdoff); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async) { -- cgit v1.2.3 From f4687d2637a4016b2eedfdb777105c95e8d6fe52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Apr 2017 16:13:53 -0700 Subject: rcu: Add preemptibility checks in rcu_sched_qs() and rcu_bh_qs() This commit adds WARN_ON_ONCE() calls that trigger if either rcu_sched_qs() or rcu_bh_qs() are invoked with preemption enabled. In the immortal words of Peter Zijlstra: "these are much harder to ignore than comments". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3bee58fc23b1..b01a02e7a0b7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,6 +250,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -265,6 +266,7 @@ void rcu_sched_qs(void) void rcu_bh_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), -- cgit v1.2.3 From 59d80fd8351b7b9a5dc7bbfa8bc4ca19f6ff3dad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 10:20:28 -0700 Subject: rcu: Print out rcupdate.c non-default boot-time settings This commit adds a rcupdate_announce_bootup_oddness() function to print out non-default values of significant kernel boot parameter settings to aid in debugging. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 42 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3b432fa4c45b..eb5ebdce25ff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -92,6 +92,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); if (IS_ENABLED(CONFIG_RCU_BOOST)) pr_info("\tRCU kthread priority: %d.\n", kthread_prio); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 273e869ca21d..82a5aa10dbc5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -560,7 +560,8 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); @@ -851,6 +852,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PROVE_RCU /* @@ -935,3 +953,25 @@ late_initcall(rcu_verify_early_boot_tests); #else void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any significant non-default boot-time settings. + */ +void __init rcupdate_announce_bootup_oddness(void) +{ + if (rcu_normal) + pr_info("\tNo expedited grace period (rcu_normal).\n"); + else if (rcu_normal_after_boot) + pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); + else if (rcu_expedited) + pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); + if (rcu_cpu_stall_suppress) + pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); + if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) + pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); + rcu_tasks_bootup_oddness(); +} + +#endif /* #ifndef CONFIG_TINY_RCU */ -- cgit v1.2.3 From 17c7798bea64b9d35c7b4cc14d564e6feff73ac3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:12:34 -0700 Subject: rcu: Update rcu_bootup_announce_oddness() This commit updates rcu_bootup_announce_oddness() to check additional Kconfig options and module/boot parameters. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- kernel/rcu/tree_plugin.h | 31 ++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b01a02e7a0b7..ac8dce15fd74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -536,9 +536,12 @@ void rcu_all_qs(void) } EXPORT_SYMBOL_GPL(rcu_all_qs); -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +static long blimit = DEFAULT_RCU_BLIMIT; +#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +static long qhimark = DEFAULT_RCU_QHIMARK; +#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +static long qlowmark = DEFAULT_RCU_QLOMARK; module_param(blimit, long, 0444); module_param(qhimark, long, 0444); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index eb5ebdce25ff..9cb3dff78b6f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,7 +79,9 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); - if (IS_ENABLED(CONFIG_PROVE_RCU)) + if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY)) + pr_info("\tRCU lockdep checking is permanently enabled.\n"); + else if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); @@ -90,8 +92,31 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT)) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP)) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); } -- cgit v1.2.3 From b5815e6cd3b747cacf7628a32a275aa2c0f61e06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:20:29 -0700 Subject: srcu: Make exp_holdoff module parameter be static Because exp_holdoff is not used outside of srcutree.c, it can be static. This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 03d57fe9f094..08d43736f72a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,7 +40,7 @@ #include "rcu.h" #include "rcu_segcblist.h" -ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ module_param(exp_holdoff, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); -- cgit v1.2.3 From 0c8e0e3c37955d17cced37222a10c00ab47efd4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:24:22 -0700 Subject: srcu: Print non-default exp_holdoff values at boot time This commit makes srcu_bootup_announce() check for non-default values of the auto-expedite holdoff time exp_holdoff and print a message if so. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 08d43736f72a..0b6ea105d9f8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,7 +40,9 @@ #include "rcu.h" #include "rcu_segcblist.h" -static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +/* Holdoff in nanoseconds for auto-expediting. */ +#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) +static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); @@ -1171,6 +1173,8 @@ EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); + if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) + pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); return 0; } early_initcall(srcu_bootup_announce); -- cgit v1.2.3 From c0b334c5bfa98ab104bde38da330a113a6c7dd56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 12:32:15 -0700 Subject: rcu: Add lockdep_assert_held() teeth to tree.c Comments can be helpful, but assertions carry more force. This commit therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN() calls to enforce lock-held and interrupt-disabled preconditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac8dce15fd74..121c1436a7f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -762,6 +762,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); return READ_ONCE(*fp); } @@ -773,6 +774,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -799,6 +801,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -972,6 +975,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -1679,6 +1683,8 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { + lockdep_assert_held(&rnp->lock); + /* * If RCU is idle, we just wait for the next grace period. * But we can only be sure that RCU is idle if we are looking @@ -1724,6 +1730,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + lockdep_assert_held(&rnp->lock); + /* * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. @@ -1850,6 +1858,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1888,6 +1898,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1914,6 +1926,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; + lockdep_assert_held(&rnp->lock); + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && !unlikely(READ_ONCE(rdp->gpwrap))) { @@ -2346,6 +2360,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2407,6 +2422,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + lockdep_assert_held(&rcu_get_root(rsp)->lock); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2431,6 +2447,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; + lockdep_assert_held(&rnp->lock); + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { @@ -2491,6 +2509,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; + lockdep_assert_held(&rnp->lock); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2604,6 +2623,8 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs do not have orphanable callbacks. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; @@ -2644,6 +2665,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs are handled specially. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2710,6 +2733,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -3703,6 +3727,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; -- cgit v1.2.3 From ea9b0c8a26a2cadfe49382d679eee88d3c4de79c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 13:19:28 -0700 Subject: rcu: Add lockdep_assert_held() teeth to tree_plugin.h Comments can be helpful, but assertions carry more force. This commit therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN() calls to enforce lock-held and interrupt-disabled preconditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9cb3dff78b6f..ee7cea75273e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -181,6 +181,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -289,6 +291,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -318,6 +321,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -634,6 +638,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -1024,6 +1029,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1404,6 +1410,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { *nextevt = KTIME_MAX; return 0; @@ -1456,6 +1463,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1511,6 +1519,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || rcu_is_nocb_cpu(smp_processor_id())) return; @@ -2544,6 +2553,8 @@ static void rcu_sysidle_enter(int irq) unsigned long j; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -2615,6 +2626,8 @@ static void rcu_sysidle_exit(int irq) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -2674,6 +2687,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long j; struct rcu_dynticks *rdtp = rdp->dynticks; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ if (!tick_nohz_full_enabled()) return; @@ -2842,6 +2857,8 @@ bool rcu_sys_is_idle(void) static struct rcu_sysidle_head rsh; int rss = READ_ONCE(full_sysidle_state); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!"); + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) return false; -- cgit v1.2.3 From d4efe6c5ad91f9a1f2f1d66b7fbfc87e320b2abc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 14:16:16 -0700 Subject: srcu: Shrink Tiny SRCU a bit In Tiny SRCU, __srcu_read_lock() is a trivial function, outweighed by its EXPORT_SYMBOL_GPL(), and on many architectures, its call sequence. This commit therefore moves it to srcutiny.h so that it can be inlined. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32798eb14853..988543721d5d 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -95,22 +95,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Can be invoked from irq/bh handlers, but the matching - * __srcu_read_unlock() must be in the same handler instance. Returns an - * index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - /* * Removes the count for the old reader from the appropriate element of * the srcu_struct. -- cgit v1.2.3 From a602538e46c9c62e75f1f0be9806495c79bcda9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 15:39:34 -0700 Subject: srcu: Add DEBUG_OBJECTS_RCU_HEAD functionality This commit adds DEBUG_OBJECTS_RCU_HEAD checking to detect call_srcu() counterparts to double-free bugs. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0b6ea105d9f8..c6e2a4a1628b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -761,6 +761,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) return true; /* With reasonable probability, idle! */ } +/* + * SRCU callback function to leak a callback. + */ +static void srcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating @@ -799,6 +806,12 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, struct srcu_data *sdp; check_init_srcu_struct(sp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ + WRITE_ONCE(rhp->func, srcu_leak_callback); + WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); + return; + } rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); @@ -973,9 +986,12 @@ void srcu_barrier(struct srcu_struct *sp) spin_lock_irq(&sdp->lock); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) + &sdp->srcu_barrier_head, 0)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); + } spin_unlock_irq(&sdp->lock); } @@ -1100,6 +1116,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_unlock_irq(&sdp->lock); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + debug_rcu_head_unqueue(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); -- cgit v1.2.3 From 68ab0b4263224157f4d0c0e42854169a183d7534 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 16:19:07 -0700 Subject: rcu: Make synchronize_rcu_mult() check for duplicates Currently, doing synchronize_rcu_mult(call_rcu, call_rcu) might (or might not) wait for two RCU grace periods. One approach is of course "don't do that!", but in CONFIG_PREEMPT=n kernels, synchronize_rcu_mult(call_rcu, call_rcu_sched) does exactly that. This results in an ugly #ifdef in sched_cpu_deactivate(). This commit therefore makes __wait_rcu_gp() check for duplicates, which in turn allows duplicates to be passed to synchronize_rcu_mult() without risk of waiting twice on the same type of grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 82a5aa10dbc5..123a9c4b5055 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -379,6 +379,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; + int j; /* Initialize and register callbacks for each flavor specified. */ for (i = 0; i < n; i++) { @@ -390,7 +391,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } init_rcu_head_on_stack(&rs_array[i].head); init_completion(&rs_array[i].completion); - (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); } /* Wait for all callbacks to be invoked. */ @@ -399,7 +404,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, (crcu_array[i] == call_rcu || crcu_array[i] == call_rcu_bh)) continue; - wait_for_completion(&rs_array[i].completion); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + wait_for_completion(&rs_array[i].completion); destroy_rcu_head_on_stack(&rs_array[i].head); } } -- cgit v1.2.3 From d7d34d5e46140a13f86379c87a196dc8a0b9b585 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 16:33:07 -0700 Subject: sched: Rely on synchronize_rcu_mult() de-duplication The synchronize_rcu_mult() function now detects duplicate requests for the same grace-period flavor and waits only once for each flavor. This commit therefore removes the ugly #ifdef from sched_cpu_deactivate() because synchronize_rcu_mult(call_rcu, call_rcu_sched) now does what the #ifdef used to be needed for. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner --- kernel/sched/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..e91138fcde86 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5874,15 +5874,9 @@ int sched_cpu_deactivate(unsigned int cpu) * users of this state to go away such that all new such users will * observe it. * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * * Do sync before park smpboot threads to take care the rcu boost case. */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + synchronize_rcu_mult(call_rcu, call_rcu_sched); if (!sched_smp_initialized) return 0; -- cgit v1.2.3 From 511324e462a12ea8be1a7e5fc63a992134db80d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 17:04:09 -0700 Subject: rcu: Use RCU_NOCB_WAKE rather than RCU_NOGP_WAKE The RCU_NOGP_WAKE_NOT, RCU_NOGP_WAKE, and RCU_NOGP_WAKE_FORCE flags are used to mediate wakeups for the no-CBs CPU kthreads. The "NOGP" really doesn't make any sense, so this commit does s/NOGP/NOCB/. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 +++--- kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0fa7aee9ef55..ddfa34d020ba 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -312,9 +312,9 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ -#define RCU_NOGP_WAKE_NOT 0 -#define RCU_NOGP_WAKE 1 -#define RCU_NOGP_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_NOT 0 +#define RCU_NOCB_WAKE 1 +#define RCU_NOCB_WAKE_FORCE 2 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ee7cea75273e..0b1042545116 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1901,7 +1901,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -1915,7 +1915,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2242,8 +2242,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } -- cgit v1.2.3 From 6b5fc3a1331810db407c9e0e673dc1837afdc9d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 20:11:09 -0700 Subject: rcu: Add memory barriers for NOCB leader wakeup Wait/wakeup operations do not guarantee ordering on their own. Instead, either locking or memory barriers are required. This commit therefore adds memory barriers to wake_nocb_leader() and nocb_leader_wait(). Signed-off-by: Paul E. McKenney Tested-by: Krister Johansen Cc: # 4.6.x --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b1042545116..573fbe9640a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1810,6 +1810,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -2064,6 +2065,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) -- cgit v1.2.3 From a68a2bb28bbf7a6dd4672a25bd87fd1b5db4fa7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 08:34:57 -0700 Subject: rcu: Move docbook comments out of rcupdate.h The include/linux/rcupdate.h file is included by more than 200 files, so shrinking it should provide some build-time benefits. This commit therefore moves several docbook comments from rcupdate.h to kernel/rcu/update.c, kernel/rcu/tree.c, and kernel/rcu/tree_plugin.h, thus reducing the number of times that the compiler has to scan these comments. This likely provides only a small benefit, but every little bit helps. This commit also fixes a malformed bulleted list noted by the 0day Test Robot. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 42 ++++++++++++++++++++++++++++++++++++++---- kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++++++++++++++-- kernel/rcu/update.c | 20 +++++++++++++++++--- 3 files changed, 86 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121c1436a7f3..5ebc830297c1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3223,8 +3223,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, local_irq_restore(flags); } -/* - * Queue an RCU-sched callback for invocation after a grace period. +/** + * call_rcu_sched() - Queue an RCU for invocation after sched grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_sched() assumes + * that the read-side critical sections end on enabling of preemption + * or on voluntary preemption. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. + * + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { @@ -3232,8 +3248,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * Queue an RCU callback for invocation after a quicker grace period. +/** + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 573fbe9640a0..116cf8339826 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -675,8 +675,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 123a9c4b5055..84dec2c8ad1b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -576,9 +576,23 @@ module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; -/* - * Post an RCU-tasks callback. First call must be from process context - * after the scheduler if fully operational. +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { -- cgit v1.2.3 From 3caec62fbb313946b9be53720bbf2280bb19ec28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 09:27:15 -0700 Subject: rcu: Move rcu_expedited and rcu_normal externs from rcupdate.h The rcu_expedited and rcu_normal variables are used only by sysctl and kernel/rcu/update.c, so it does not make sense to their extern declarations in rcupdate.h. This commit therefore moves these extern declarations to update.c. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84dec2c8ad1b..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,7 +62,9 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU +extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); +extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); -- cgit v1.2.3 From 25c36329a30c8cac090effe1fbae9bb916fa95fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 09:51:55 -0700 Subject: rcu: Move expediting-related access/control out of rcupdate.h The rcu_gp_is_normal(), rcu_gp_is_expedited(), rcu_expedite_gp(), and rcu_unexpedite_gp() functions are intended only for use within the RCU implementation itself -- the sysfs access is what should be used outside of RCU. This commit therefore moves the declarations for these functions to kernel/rcu/rcu.h, and also includes this file into kernel/rcu/rcutorture.c and kernel/rcu/rcuperf.c. This also has the beneficial effect of shrinking rcupdate.c a bit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 26 ++++++++++++++++++++++++++ kernel/rcu/rcuperf.c | 2 ++ kernel/rcu/rcutorture.c | 2 ++ 3 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 73e16ec4054b..ceb78110db1b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -293,4 +293,30 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d80f11d9f8bd..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -48,6 +48,8 @@ #include #include +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a58592b73f19..03cdf79e73d4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,8 @@ #include #include +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -- cgit v1.2.3 From cad7b3897279c869de61dc88133037b941f84233 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 10:22:57 -0700 Subject: rcu: Move torture-related definitions from rcupdate.h to rcu.h The include/linux/rcupdate.h file contains a number of definitions that are used only to communicate between rcutorture, rcuperf, and the RCU code itself. There is no point in having these definitions exposed globally throughout the kernel, so this commit moves them to kernel/rcu/rcu.h. This change has the added benefit of shrinking rcupdate.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ceb78110db1b..f190fc1c8215 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -319,4 +319,89 @@ void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); +void rcutorture_record_test_transition(void); +void rcutorture_record_progress(unsigned long vernum); +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#ifdef CONFIG_RCU_TRACE +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif +#endif + +#ifdef CONFIG_TINY_SRCU + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + +#elif defined(CONFIG_TREE_SRCU) + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#elif defined(CONFIG_CLASSIC_SRCU) + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + (*gpnum)++; +} + +#endif + #endif /* __LINUX_RCU_H */ -- cgit v1.2.3 From 791875d16e2f6e2e5b90328ccac643f512ac76c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:06:05 -0700 Subject: rcu: Eliminate the unused __rcu_is_watching() function The __rcu_is_watching() function is currently not used, aside from to implement the rcu_is_watching() function. This commit therefore eliminates __rcu_is_watching(), which has the beneficial side-effect of shrinking include/linux/rcupdate.h a bit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 13 ------------- kernel/rcu/tree.c | 19 ++++--------------- 2 files changed, 4 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index e5385731e391..2306cab2195d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -59,19 +59,6 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL(rcu_barrier_sched); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool notrace __rcu_is_watching(void) -{ - return true; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5ebc830297c1..61a97164abcc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1138,23 +1138,12 @@ void rcu_nmi_exit(void) rcu_dynticks_eqs_enter(); } -/** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is idle * - * If the current CPU is in its idle loop and is neither in an interrupt + * Return true if RCU is watching the running CPU, which means that this + * CPU can safely enter RCU read-side critical sections. In other words, + * if the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ bool notrace rcu_is_watching(void) @@ -1162,7 +1151,7 @@ bool notrace rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = __rcu_is_watching(); + ret = !rcu_dynticks_curr_cpu_in_eqs(); preempt_enable_notrace(); return ret; } -- cgit v1.2.3 From 82118249d0ca4078d56d5e43172ada1567fdf946 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:13:24 -0700 Subject: rcu: Move the RCU_SCHEDULER_ definitions from rcupdate.h The RCU_SCHEDULER_INACTIVE, RCU_SCHEDULER_INIT, and RCU_SCHEDULER_RUNNING definitions are used only within RCU, so this commit moves them from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f190fc1c8215..17fee2a667d9 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -319,6 +319,10 @@ void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, -- cgit v1.2.3 From fa3c66476975abf00c97f27b6c2b3d223f7d57f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:38:55 -0700 Subject: rcu: Improve __call_rcu() debug-objects error message The "__call_rcu(): Leaked duplicate callback" error message from __call_rcu() has proven to be unhelpful. This commit therefore changes it to "__call_rcu(): Double-freed CB" and adds the value of the pointer passed in. The value of the pointer improves debuggability by allowing correlation with tracing output, for example, the rcu:rcu_callback trace event. Reported-by: Vegard Nossum Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 61a97164abcc..cac24f5d3fd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3161,9 +3161,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ + /* + * Probable double call_rcu(), so leak the callback. + * Use rcu:rcu_callback trace event to find the previous + * time callback was passed to __call_rcu(). + */ + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } head->func = func; -- cgit v1.2.3 From 3d54f7983f3e6ac9f444fa20970b1abc8f089b79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 12:25:50 -0700 Subject: rcu: Move rcu_is_nocb_cpu() from rcupdate.h to rcu.h The rcu_is_nocb_cpu() function is used only internally to RCU. This commit therefore moves its declaration from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 17fee2a667d9..2f344662c568 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -408,4 +408,12 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif +#if defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline bool rcu_is_nocb_cpu(int cpu) { return true; } +#elif defined(CONFIG_RCU_NOCB_CPU) +bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif + #endif /* __LINUX_RCU_H */ -- cgit v1.2.3 From b8989b76052eedc99b09322efd6f68816f191a1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 12:28:59 -0700 Subject: rcu: Move rcu_ftrace_dump() from rcupdate.h to rcu.h The rcu_ftrace_dump() function is used only internally to RCU. This commit therefore moves its declaration from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2f344662c568..cdbaa441bdac 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + void rcu_early_boot_tests(void); void rcu_test_sync_prims(void); -- cgit v1.2.3 From e3c8d51e1a58c73a557eb38a9a6afb4f704a3379 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 13:37:16 -0700 Subject: rcu: Move torture-related functions out of rcutiny.h and rcutree.h The various functions similar to rcu_batches_started(), the function show_rcu_gp_kthreads(), the various functions similar to rcu_force_quiescent_state(), and the variables rcutorture_testseq and rcutorture_vernum are used only within RCU. There is therefore no point in exporting them to the kernel at large from include/linux/rcutiny.h and include/linux/rcutree.h. This commit therefore moves all of these to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index cdbaa441bdac..d849b371b32b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -420,6 +420,105 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif +#ifdef CONFIG_TINY_RCU + +/* + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. + */ +static inline unsigned long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods completed. + */ +static inline unsigned long rcu_batches_completed_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods completed. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + +static inline void show_rcu_gp_kthreads(void) +{ +} + +#else /* #ifdef CONFIG_TINY_RCU */ +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); +void show_rcu_gp_kthreads(void); +void rcu_force_quiescent_state(void); +void rcu_bh_force_quiescent_state(void); +void rcu_sched_force_quiescent_state(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + #if defined(CONFIG_RCU_NOCB_CPU_ALL) static inline bool rcu_is_nocb_cpu(int cpu) { return true; } #elif defined(CONFIG_RCU_NOCB_CPU) -- cgit v1.2.3 From fe21a27e8ca0937a5ac298de1f4b46382e9c5c88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 13:45:51 -0700 Subject: rcu: Move rcu_request_urgent_qs_task() out of rcutiny.h and rcutree.h The rcu_request_urgent_qs_task() function is used only within RCU, so there is no point in exporting it to the rest of the kernel from nclude/linux/rcutiny.h and include/linux/rcutree.h. This commit therefore moves this function to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index d849b371b32b..5b76a5baff2e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -335,6 +335,12 @@ void rcupdate_announce_bootup_oddness(void); #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 +#ifdef CONFIG_TINY_RCU +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +#else /* #ifdef CONFIG_TINY_RCU */ +void rcu_request_urgent_qs_task(struct task_struct *t); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, -- cgit v1.2.3 From c350c008297643dad3c395c2fd92230142da5cf6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 15:35:32 -0700 Subject: srcu: Prevent sdp->srcu_gp_seq_needed counter wrap If a given CPU never happens to ever start an SRCU grace period, the grace-period sequence counter might wrap. If this CPU were to decide to finally start a grace period, the state of its sdp->srcu_gp_seq_needed might make it appear that it has already requested this grace period, which would prevent starting the grace period. If no other CPU ever started a grace period again, this would look like a grace-period hang. Even if some other CPU took pity and started the needed grace period, the leaf rcu_node structure's ->srcu_data_have_cbs field won't have record of the fact that this CPU has a callback pending, which would look like a very localized grace-period hang. This might seem very unlikely, but SRCU grace periods can take less than a microsecond on small systems, which means that overflow can happen in much less than an hour on a 32-bit embedded system. And embedded systems are especially likely to have long-term idle CPUs. Therefore, it makes sense to prevent this scenario from happening. This commit therefore scans each srcu_data structure occasionally, with frequency controlled by the srcutree.counter_wrap_check kernel boot parameter. This parameter can be set to something like 255 in order to exercise the counter-wrap-prevention code. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c6e2a4a1628b..cc06dbfc9692 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -45,6 +45,10 @@ static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); +/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ +static ulong counter_wrap_check = (ULONG_MAX >> 2); +module_param(counter_wrap_check, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -496,10 +500,13 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + int cpu; + unsigned long flags; unsigned long gpseq; int idx; int idxnext; unsigned long mask; + struct srcu_data *sdp; struct srcu_node *snp; /* Prevent more than one additional grace period. */ @@ -538,6 +545,17 @@ static void srcu_gp_end(struct srcu_struct *sp) smp_mb(); /* GP end before CB invocation. */ srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); } + + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irqsave(&sdp->lock, flags); + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + spin_unlock_irqrestore(&sdp->lock, flags); + } } /* Callback initiation done, allow grace periods after next. */ -- cgit v1.2.3 From 5a0465e17a18c467b712a816985b7b8dd8d10c16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 11:31:04 -0700 Subject: srcu: Shrink srcu.h by moving docbook and private function The call_srcu() docbook entry is currently in include/linux/srcu.h, which causes needless processing for each include point. This commit therefore moves this entry to kernel/rcu/srcutree.c, which the compiler reads only once. In addition, the srcu_batches_completed() function is used only within RCU and its torture-test suites. This commit therefore also moves this function's declaration from include/linux/srcutiny.h, include/linux/srcutree.h, and include/linux/srcuclassic.h to kernel/rcu/rcu.h. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 6 ++++++ kernel/rcu/srcutree.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5b76a5baff2e..74d9fc205313 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -492,6 +492,11 @@ static inline unsigned long rcu_exp_batches_completed_sched(void) return 0; } +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } @@ -519,6 +524,7 @@ unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cc06dbfc9692..66a998f9c5a7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,6 +854,23 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, srcu_funnel_exp_start(sp, sdp->mynode, s); } +/** + * call_srcu() - Queue a callback for invocation after an SRCU grace period + * @sp: srcu_struct in queue the callback + * @head: structure to be used for queueing the SRCU callback. + * @func: function to be invoked after the SRCU grace period + * + * The callback function will be invoked some time after a full SRCU + * grace period elapses, in other words after all pre-existing SRCU + * read-side critical sections have completed. However, the callback + * function might well execute concurrently with other SRCU read-side + * critical sections that started after call_srcu() was invoked. SRCU + * read-side critical sections are delimited by srcu_read_lock() and + * srcu_read_unlock(), and may be nested. + * + * The callback will be invoked from process context, but must nevertheless + * be fast and must not block. + */ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { -- cgit v1.2.3 From 2464dd940e23bad227c387a40eec99f7aa02ed96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 14:29:16 -0700 Subject: srcu: Apply trivial callback lists to shrink Tiny SRCU The rcu_segcblist structure provides quite a bit of functionality, and Tiny SRCU needs almost none of it. So this commit replaces Tiny SRCU's uses of rcu_segcblist with a simple singly linked list with tail pointer. This change significantly reduces Tiny SRCU's memory footprint, more than making up for the growth caused by the creation of rcu_segcblist.c Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 +- kernel/rcu/srcutiny.c | 70 +++++++++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 74d9fc205313..6a1e85bd2eac 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -398,7 +398,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_gp_seq; + *completed = sp->srcu_idx; *gpnum = *completed; } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 988543721d5d..1a1c1047d2ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_lock_nesting[0] = 0; sp->srcu_lock_nesting[1] = 0; init_swait_queue_head(&sp->srcu_wq); - sp->srcu_gp_seq = 0; - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; sp->srcu_gp_running = false; sp->srcu_gp_waiting = false; sp->srcu_idx = 0; @@ -88,10 +88,10 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); flush_work(&sp->srcu_work); - WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); - WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); + WARN_ON(sp->srcu_cb_head); + WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -117,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); void srcu_drive_gp(struct work_struct *wp) { int idx; - struct rcu_cblist ready_cbs; - struct srcu_struct *sp; + struct rcu_head *lh; struct rcu_head *rhp; + struct srcu_struct *sp; sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) return; /* Already running or nothing to do. */ - /* Tag recently arrived callbacks and wait for readers. */ + /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(sp->srcu_gp_running, true); - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); + local_irq_disable(); + lh = sp->srcu_cb_head; + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + local_irq_enable(); idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ - rcu_seq_end(&sp->srcu_gp_seq); - - /* Update callback list based on GP, and invoke ready callbacks. */ - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - rcu_cblist_init(&ready_cbs); - local_irq_disable(); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - } - local_irq_disable(); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); + + /* Invoke the callbacks we removed above. */ + while (lh) { + rhp = lh; + lh = lh->next; + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); } - WRITE_ONCE(sp->srcu_gp_running, false); /* - * If more callbacks, reschedule ourselves. This can race with - * a call_srcu() at interrupt level, but the ->srcu_gp_running - * checks will straighten that out. + * Enable rescheduling, and if there are more callbacks, + * reschedule ourselves. This can race with a call_srcu() + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. */ - if (!rcu_segcblist_empty(&sp->srcu_cblist)) + WRITE_ONCE(sp->srcu_gp_running, false); + if (READ_ONCE(sp->srcu_cb_head)) schedule_work(&sp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -171,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - head->func = func; + rhp->func = func; + rhp->next = NULL; local_irq_save(flags); - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + *sp->srcu_cb_tail = rhp; + sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); if (!READ_ONCE(sp->srcu_gp_running)) schedule_work(&sp->srcu_work); -- cgit v1.2.3 From 681fbec881dea1848e9246d7d1ecb3b97f11026d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 15:44:38 -0700 Subject: lockdep: Use consistent printing primitives Commit a5dd63efda3d ("lockdep: Use "WARNING" tag on lockdep splats") substituted pr_warn() for printk() in places called out by Dmitry Vyukov. However, this resulted in an ugly mix of pr_warn() and printk(). This commit therefore changes printk() to pr_warn() or pr_cont(), depending on the absence or presence of KERN_CONT. This is done in all functions that had printk() changed to pr_warn() by the aforementioned commit. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 172 +++++++++++++++++++++++------------------------ 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0e31bfee25c..cceb9534338a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("======================================================\n"); pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); pr_warn("------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); + pr_warn("\nwhich lock already depends on the new lock.\n\n"); + pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); pr_warn("-----------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr, curr->softirqs_enabled); print_lock(next); - printk("\nand this task is already holding:\n"); + pr_warn("\nand this task is already holding:\n"); print_lock(prev); - printk("which would create a new lock dependency:\n"); + pr_warn("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(KERN_CONT " ->"); + pr_cont(" ->"); print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); + pr_cont("\n"); - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); + pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - printk("\nto a %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); + pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); + pr_warn("..."); print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - printk("\nother info that might help us debug this:\n\n"); + pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); + pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nthe dependencies between the lock to be acquired"); + pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) return 0; print_shortest_lock_dependencies(forwards_entry, next_root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { - printk("\n"); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); pr_warn("----------------------------\n"); - printk("%s/%d: ", current->comm, task_pid_nr(current)); - printk("Hash chain already cached but the contents don't match!\n"); + pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); + pr_warn("Hash chain already cached but the contents don't match!\n"); - printk("Held locks:"); + pr_warn("Held locks:"); print_chain_keys_held_locks(curr, hlock_next); - printk("Locks in cached chain:"); + pr_warn("Locks in cached chain:"); print_chain_keys_chain(chain); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } #endif @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); pr_warn("--------------------------------\n"); - printk("inconsistent {%s} -> {%s} usage.\n", + pr_warn("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, trace_softirqs_enabled(curr)); print_lock(this); - printk("{%s} state was registered at:\n", usage_str[prev_bit]); + pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_usage_bug_scenario(this); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", + pr_warn("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); /* Find a middle lock (if one exists) */ depth = get_lock_depth(other); do { if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); + pr_warn("lockdep:%s bad path found in chain graph\n", __func__); break; } middle = entry; @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) return 0; print_shortest_lock_dependencies(other, root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); pr_warn("----------------------------------\n"); - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); + pr_warn("\nbut this task is not holding:\n"); + pr_warn("%s\n", hlock->nest_lock->name); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); pr_warn("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", + pr_warn("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no more locks to release!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); pr_warn("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", + pr_warn("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no locks held!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) int unlock = 1; if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); + pr_warn("INFO: lockdep is turned off.\n"); return; } - printk("\nShowing all locks held in the system:\n"); + pr_warn("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); + pr_warn("hm, tasklist_lock locked, retrying... "); if (count) { count--; - printk(" #%d", 10-count); + pr_cont(" #%d", 10-count); mdelay(200); goto retry; } - printk(" ignoring it.\n"); + pr_cont(" ignoring it.\n"); unlock = 0; } else { if (count != 10) - printk(KERN_CONT " locked it.\n"); + pr_cont(" locked it.\n"); } do_each_thread(g, p) { @@ -4445,7 +4445,7 @@ retry: unlock = 1; } while_each_thread(g, p); - printk("\n"); + pr_warn("\n"); pr_warn("=============================================\n\n"); if (unlock) @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n"); + pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); pr_warn("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", + pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } @@ -4495,14 +4495,14 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) return; #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); pr_warn("-----------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("%s:%d %s!\n", file, line, s); + pr_warn("\nother info that might help us debug this:\n\n"); + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() @@ -4529,10 +4529,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * rcu_read_lock_bh() and so on from extended quiescent states. */ if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); + pr_warn("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); -- cgit v1.2.3 From bf32c76540257f9f5f2cf661dbdd8bb4a4bd8c82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 12:05:46 -0700 Subject: rcu: Convert rnp->lock wrappers to macros for SRCU use Use of smp_mb__after_unlock_lock() would allow SRCU to omit a full memory barrier during callback execution, so this commit converts raw_spin_lock_rcu_node() from inline functions to type-generic macros to allow them to handle locks in srcu_node structures as well as rcu_node structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ddfa34d020ba..a7f63f1074b4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -580,27 +580,22 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * As ->lock of struct rcu_node is a __private field, therefore one should use * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ -static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) -static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); -} +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) -static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) -static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); -} +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) #define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ do { \ @@ -615,11 +610,11 @@ do { \ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) -static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) -{ - bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); - - if (locked) - smp_mb__after_unlock_lock(); - return locked; -} +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) -- cgit v1.2.3 From 83d40bd3bc3ab3d6b5a4a331f7667d627948a099 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 13:28:51 -0700 Subject: rcu: Move rnp->lock wrappers for SRCU use This commit moves the now-generic rnp->lock wrapper macros from kernel/rcu/tree.h to kernel/rcu/rcu.h, thus allowing SRCU to use them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.h | 53 ----------------------------------------------------- 2 files changed, 53 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6a1e85bd2eac..2a75beb883c8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -303,6 +303,59 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) cpu <= rnp->grphi; \ cpu = cpumask_next((cpu), cpu_possible_mask)) +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ +} while (0) + +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a7f63f1074b4..baa0bac8da2a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -565,56 +565,3 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Wrappers for the rcu_node::lock acquire and release. - * - * Because the rcu_nodes form a tree, the tree traversal locking will observe - * different lock values, this in turn means that an UNLOCK of one level - * followed by a LOCK of another level does not imply a full memory barrier; - * and most importantly transitivity is lost. - * - * In order to restore full ordering between tree levels, augment the regular - * lock acquire functions with smp_mb__after_unlock_lock(). - * - * As ->lock of struct rcu_node is a __private field, therefore one should use - * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. - */ -#define raw_spin_lock_rcu_node(p) \ -do { \ - raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) - -#define raw_spin_lock_irq_rcu_node(p) \ -do { \ - raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) - -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) - -#define raw_spin_trylock_rcu_node(p) \ -({ \ - bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ - \ - if (___locked) \ - smp_mb__after_unlock_lock(); \ - ___locked; \ -}) -- cgit v1.2.3 From a3883df3935e10caa8297719d85fa8eaff7cabbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 15:00:14 -0700 Subject: srcu: Use rnp->lock wrappers to replace explicit memory barriers This commit uses TREE RCU's rnp->lock wrappers to replace a few explicit memory barriers. This change also has the advantage of making SRCU's memory-ordering properties be implemented in roughly the same way as they are in Tree RCU. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 91 ++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 66a998f9c5a7..d0ca524bf042 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -76,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_init(&snp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -110,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_init(&sdp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -169,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * to each update-side SRCU primitive. Use sp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ @@ -209,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -411,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), - "Invoked srcu_gp_start() without ->gp_lock!"); + lockdep_assert_held(&sp->lock); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -513,7 +512,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_irq(&snp->lock); + raw_spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,21 +539,19 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq(&snp->lock); - if (cbs) { - smp_mb(); /* GP end before CB invocation. */ + raw_spin_unlock_irq_rcu_node(snp); + if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); - } /* Occasionally prevent srcu_data counter wrap. */ if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irqsave(&sdp->lock, flags); + raw_spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -562,17 +559,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); } } @@ -592,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -625,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { - smp_mb(); /* CBs after GP! */ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -647,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -670,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -833,7 +829,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - spin_lock(&sdp->lock); + raw_spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -847,7 +843,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -1018,7 +1014,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1027,7 +1023,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1076,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1135,20 +1131,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - spin_lock_irq(&sdp->lock); - smp_mb(); /* Old grace periods before callback invocation! */ + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1161,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1180,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1190,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); -- cgit v1.2.3 From 90040c9e3015054db7efa0101afdd446d1167fe8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 May 2017 14:36:55 -0700 Subject: rcu: Remove *_SLOW_* Kconfig options The RCU_TORTURE_TEST_SLOW_PREINIT, RCU_TORTURE_TEST_SLOW_PREINIT_DELAY, RCU_TORTURE_TEST_SLOW_PREINIT_DELAY, RCU_TORTURE_TEST_SLOW_INIT, RCU_TORTURE_TEST_SLOW_INIT_DELAY, RCU_TORTURE_TEST_SLOW_CLEANUP, and RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY Kconfig options are only useful for torture testing, and there are the rcutree.gp_cleanup_delay, rcutree.gp_init_delay, and rcutree.gp_preinit_delay kernel boot parameters that rcutorture can use instead. The effect of these parameters is to artificially slow down grace period initialization and cleanup in order to make some types of race conditions happen more often. This commit therefore simplifies Tree RCU a bit by removing the Kconfig options and adding the corresponding kernel parameters to rcutorture's .boot files instead. However, this commit also leaves out the kernel parameters for TREE02, TREE04, and TREE07 in order to have about the same number of tests slowed as not slowed. TREE01, TREE03, TREE05, and TREE06 are slowed, and the rest are not slowed. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 26 ++++++-------------------- kernel/rcu/tree_plugin.h | 6 +++--- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cac24f5d3fd2..bbbddd85906b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -177,26 +177,12 @@ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; -module_param(gp_preinit_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ -static const int gp_preinit_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT -static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; -module_param(gp_init_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -static const int gp_init_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; -module_param(gp_cleanup_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ -static const int gp_cleanup_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static int gp_preinit_delay; +module_param(gp_preinit_delay, int, 0444); +static int gp_init_delay; +module_param(gp_init_delay, int, 0444); +static int gp_cleanup_delay; +module_param(gp_cleanup_delay, int, 0444); /* * Number of grace periods between delays, normalized by the duration of diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 116cf8339826..0553d9fed7d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -109,11 +109,11 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT)) + if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)) + if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP)) + if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); -- cgit v1.2.3 From f7a10a975036ef9ca957bfe12ab2d4b1a46cccd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 May 2017 15:57:16 -0700 Subject: rcu: Remove the RCU_KTHREAD_PRIO Kconfig option Anything that can be done with the RCU_KTHREAD_PRIO Kconfig option can also be done with the rcutree.kthread_prio kernel boot parameter. This commit therefore removes this Kconfig option. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Rik van Riel --- kernel/rcu/tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbbddd85906b..187ac3f41526 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -168,11 +168,7 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ -#ifdef CONFIG_RCU_KTHREAD_PRIO -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ -- cgit v1.2.3 From fe5ac724d81a3c7803e60c2232718f212f3f38d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 May 2017 11:26:22 -0700 Subject: rcu: Remove nohz_full full-system-idle state machine The NO_HZ_FULL_SYSIDLE full-system-idle capability was added in 2013 by commit 0edd1b1784cb ("nohz_full: Add full-system-idle state machine"), but has not been used. This commit therefore removes it. If it turns out to be needed later, this commit can always be reverted. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Rik van Riel Cc: Ingo Molnar Acked-by: Linus Torvalds --- kernel/rcu/tree.c | 41 +---- kernel/rcu/tree.h | 16 -- kernel/rcu/tree_plugin.h | 429 ----------------------------------------------- kernel/time/Kconfig | 50 ------ 4 files changed, 7 insertions(+), 529 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 187ac3f41526..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,10 +270,6 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; /* @@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -854,7 +847,6 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -904,7 +896,6 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); rdtp->dynticks_nesting--; } - rcu_sysidle_enter(1); } /* @@ -986,7 +977,6 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -1038,7 +1028,6 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(oldval, true); - rcu_sysidle_exit(1); } /* @@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, @@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; bool *rnhqp; @@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) */ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - bool isidle = false; - unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = true; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); + force_qs_rnp(rsp, dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = true; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) { int cpu; unsigned long flags; @@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + if (f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } } @@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index baa0bac8da2a..2c112bb11aa8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -45,14 +45,6 @@ struct rcu_dynticks { bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(int irq); -static void rcu_sysidle_exit(int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0553d9fed7d7..f524d967f7b6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!"); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. -config NO_HZ_FULL_SYSIDLE - bool "Detect full-system idle state for full dynticks system" - depends on NO_HZ_FULL - default n - help - At least one CPU must keep the scheduling-clock tick running for - timekeeping purposes whenever there is a non-idle CPU, where - "non-idle" also includes dynticks CPUs as long as they are - running non-idle tasks. Because the underlying adaptive-tick - support cannot distinguish between all CPUs being idle and - all CPUs each running a single task in dynticks mode, the - underlying support simply ensures that there is always a CPU - handling the scheduling-clock tick, whether or not all CPUs - are idle. This Kconfig option enables scalable detection of - the all-CPUs-idle state, thus allowing the scheduling-clock - tick to be disabled when all CPUs are idle. Note that scalable - detection of the all-CPUs-idle state means that larger systems - will be slower to declare the all-CPUs-idle state. - - Say Y if you would like to help debug all-CPUs-idle detection. - - Say N if you are unsure. - -config NO_HZ_FULL_SYSIDLE_SMALL - int "Number of CPUs above which large-system approach is used" - depends on NO_HZ_FULL_SYSIDLE - range 1 NR_CPUS - default 8 - help - The full-system idle detection mechanism takes a lazy approach - on large systems, as is required to attain decent scalability. - However, on smaller systems, scalability is not anywhere near as - large a concern as is energy efficiency. The sysidle subsystem - therefore uses a fast but non-scalable algorithm for small - systems and a lazier but scalable algorithm for large systems. - This Kconfig parameter defines the number of CPUs in the largest - system that will be considered to be "small". - - The default value will be fine in most cases. Battery-powered - systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger - numbers of CPUs, and (3) are suffering from battery-lifetime - problems due to long sysidle latencies might wish to experiment - with larger values for this Kconfig parameter. On the other - hand, they might be even better served by disabling NO_HZ_FULL - entirely, given that NO_HZ_FULL is intended for HPC and - real-time workloads that at present do not tend to be run on - battery-powered systems. - - Take the default if you are unsure. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS -- cgit v1.2.3 From 4e4bea7427062ec15df7084f97728e2a44d912e3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 May 2017 15:33:23 -0700 Subject: rcu: Remove typecheck() from RCU locking wrapper functions Because raw_spin_lock_irqsave() and raw_spin_unlock_irqrestore() both do typecheck() on their flags argument, there is no point in duplicating this check in raw_spin_lock_irqsave_rcu_node() and raw_spin_unlock_irqrestore_rcu_node(). This commit therefore saves a few lines by removing this duplicated check. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2a75beb883c8..bc55b5716c37 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -334,18 +334,14 @@ do { \ #define raw_spin_unlock_irq_rcu_node(p) \ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +#define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) +#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ #define raw_spin_trylock_rcu_node(p) \ ({ \ -- cgit v1.2.3 From c4a09ff752e164c020bced6513e2008f992a02e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 May 2017 14:37:19 -0700 Subject: rcu: Remove the now-obsolete PROVE_RCU_REPEATEDLY Kconfig option The PROVE_RCU_REPEATEDLY Kconfig option was initially added due to the volume of messages from PROVE_RCU: Doing just one per boot would have required excessive numbers of boots to locate them all. However, PROVE_RCU messages are now relatively rare, so there is no longer any reason to need more than one such message per boot. This commit therefore removes the PROVE_RCU_REPEATEDLY Kconfig option. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar --- kernel/locking/lockdep.c | 4 ---- kernel/rcu/tree_plugin.h | 4 +--- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index cceb9534338a..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4490,10 +4490,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); pr_warn("=============================\n"); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f524d967f7b6..7f5919ab24c4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,9 +79,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); - if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY)) - pr_info("\tRCU lockdep checking is permanently enabled.\n"); - else if (IS_ENABLED(CONFIG_PROVE_RCU)) + if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); -- cgit v1.2.3 From 7f0cd6333086ae09962791c31f0d4845a3329df9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 May 2017 17:06:30 -0700 Subject: srcu: Fix rcutorture-statistics typo The function srcutorture_get_gp_data() duplicated the check for sp->batch_check0.head instead of also checking sp->batch_check1.head. The only effect of this typo would be for rcutorture statistics to understate the fraction of time that an SRCU grace period was in flight, and only for Classic SRCU. This commit fixes this typo. Reported-by: David Binderman Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bc55b5716c37..27f871c88e0a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -469,7 +469,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *completed = sp->completed; *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) (*gpnum)++; } -- cgit v1.2.3 From bd8cc5a062f41e334596edbe823e2fa0adddd1b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 14:57:01 -0700 Subject: srcu: Remove Classic SRCU Classic SRCU was only ever intended to be a fallback in case of issues with Tree/Tiny SRCU, and the latter two are doing quite well in testing. This commit therefore removes Classic SRCU. Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 1 - kernel/rcu/rcu.h | 16 -- kernel/rcu/rcutorture.c | 17 -- kernel/rcu/srcu.c | 668 ------------------------------------------------ 4 files changed, 702 deletions(-) delete mode 100644 kernel/rcu/srcu.c (limited to 'kernel') diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 23803c7d5180..3945337c8ce4 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,6 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_CLASSIC_SRCU) += srcu.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 27f871c88e0a..d06c42deee0b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -457,22 +457,6 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, unsigned long *gpnum, unsigned long *completed); -#elif defined(CONFIG_CLASSIC_SRCU) - -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->completed; - *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) - (*gpnum)++; -} - #endif #ifdef CONFIG_TINY_RCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 03cdf79e73d4..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,31 +564,19 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) #ifdef CONFIG_TREE_SRCU idx = srcu_ctlp->srcu_idx & 0x1; -#else /* #ifdef CONFIG_TREE_SRCU */ - idx = srcu_ctlp->completed & 0x1; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; -#ifdef CONFIG_TREE_SRCU struct srcu_data *counts; counts = per_cpu_ptr(srcu_ctlp->sda, cpu); u0 = counts->srcu_unlock_count[!idx]; u1 = counts->srcu_unlock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - struct srcu_array *counts; - - counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - u0 = counts->unlock_count[!idx]; - u1 = counts->unlock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -596,13 +584,8 @@ static void srcu_torture_stats(void) */ smp_rmb(); -#ifdef CONFIG_TREE_SRCU l0 = counts->srcu_lock_count[!idx]; l1 = counts->srcu_lock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - l0 = counts->lock_count[!idx]; - l1 = counts->lock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index 4e3f558409a0..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null @@ -1,668 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney - * Lai Jiangshan - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[idx]); - } - return sum; -} - -/* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->unlock_count[idx]); - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be zero. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long unlocks; - - unlocks = srcu_readers_unlock_idx(sp, idx); - - /* - * Make sure that a lock is always counted if the corresponding unlock - * is counted. Needs to be a smp_mb() as the read side may contain a - * read from a variable that is written to before the synchronize_srcu() - * in the write side. In this case smp_mb()s A and B act like the store - * buffering pattern. - * - * This smp_mb() also pairs with smp_mb() C to prevent accesses after the - * synchronize_srcu() from being executed before the grace period ends. - */ - smp_mb(); /* A */ - - /* - * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does not - * mean that there are no more readers, as one could have read the - * current index but not have incremented the lock counter yet. - * - * Possible bug: There is no guarantee that there haven't been ULONG_MAX - * increments of ->lock_count[] since the unlocks were counted, meaning - * that this could return true even if there are still active readers. - * Since there are no memory barriers around srcu_flip(), the CPU is not - * required to increment ->completed before running - * srcu_readers_unlock_idx(), which means that there could be an - * arbitrarily large number of critical sections that execute after - * srcu_readers_unlock_idx() but use the old value of ->completed. - */ - return srcu_readers_lock_idx(sp, idx) == unlocks; -} - -/** - * srcu_readers_active - returns true if there are readers. and false - * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. - */ -static bool srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this only after you are finished using a given srcu_struct - * that was initialized via init_srcu_struct(). This code does some - * probabalistic checking, spotting late uses of srcu_read_lock(), - * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). - * If any such late uses are detected, the per-CPU memory associated with - * the srcu_struct is simply leaked and WARN_ON() is invoked. If the - * caller frees the srcu_struct itself, a use-after-free crash will likely - * ensue, but at least there will be a warning printed. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->completed) & 0x1; - this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. - */ -static void srcu_flip(struct srcu_struct *sp) -{ - WRITE_ONCE(sp->completed, sp->completed + 1); - - /* - * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). - */ - smp_mb(); /* D */ /* Pairs with C. */ -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing SRCU read-side critical section. On systems with - * more than one CPU, this means that when "func()" is invoked, each CPU - * is guaranteed to have executed a full memory barrier since the end of - * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing - * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() - * but before the beginning of that SRCU read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting SRCU callback function "func()", then both CPU A and CPU - * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". - * This guarantee applies even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * Of course, these guarantees apply only for invocations of call_srcu(), - * srcu_read_lock(), and srcu_read_unlock() that are all passed the same - * srcu_struct structure. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - rcu_callback_t func) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || - lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) { - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ - } - -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section, - * as long as the resulting graph of srcu_structs is acyclic. - * - * There are memory-ordering constraints implied by synchronize_srcu(). - * On systems with more than one CPU, when synchronize_srcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section - * whose beginning preceded the call to synchronize_srcu(). In addition, - * each CPU having an SRCU read-side critical section that extends beyond - * the return from synchronize_srcu() is guaranteed to have executed a - * full memory barrier after the beginning of synchronize_srcu() and before - * the beginning of that SRCU read-side critical section. Note that these - * guarantees include CPUs that are offline, idle, or executing in user mode, - * as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_srcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_srcu(). This guarantee applies even if CPU A and CPU B - * are the same CPU, but again only if the system has more than one CPU. - * - * Of course, these memory-ordering guarantees apply only when - * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are - * passed the same srcu_struct structure. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. Note that needed memory barriers have been executed - * in this task's context by srcu_readers_active_idx_check(). - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. - */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - queue_delayed_work(system_power_efficient_wq, - &sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); - -static int __init srcu_bootup_announce(void) -{ - pr_info("Classic SRCU implementation.\n"); - return 0; -} -early_initcall(srcu_bootup_announce); -- cgit v1.2.3 From ae91aa0adb14dc33114d566feca2f7cb7a96b8b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 15:30:32 -0700 Subject: rcu: Remove debugfs tracing RCU's debugfs tracing used to be the only reasonable low-level debug information available, but ftrace and event tracing has since surpassed the RCU debugfs level of usefulness. This commit therefore removes RCU's debugfs tracing. Signed-off-by: Paul E. McKenney --- kernel/rcu/Makefile | 1 - kernel/rcu/tiny_plugin.h | 45 ----- kernel/rcu/tree.h | 27 --- kernel/rcu/tree_plugin.h | 31 +-- kernel/rcu/tree_trace.c | 494 ----------------------------------------------- 5 files changed, 1 insertion(+), 597 deletions(-) delete mode 100644 kernel/rcu/tree_trace.c (limited to 'kernel') diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 3945337c8ce4..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 371034e77f87..c642f23f1582 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -24,8 +24,6 @@ #include #include -#include -#include /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -87,49 +85,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) local_irq_restore(flags); } -/* - * Dump statistics for TINY_RCU, such as they are. - */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutiny_trace_init); - static void check_cpu_stall(struct rcu_ctrlblk *rcp) { unsigned long j; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2c112bb11aa8..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -152,19 +152,6 @@ struct rcu_node { /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -535,17 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ #endif /* #ifndef RCU_TREE_NONCORE */ - -#ifdef CONFIG_RCU_TRACE -/* Read out queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ -#ifdef CONFIG_RCU_NOCB_CPU - *ql = atomic_long_read(&rdp->nocb_q_count); - *qll = atomic_long_read(&rdp->nocb_q_count_lazy); -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - *ql = 0; - *qll = 0; -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -} -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7f5919ab24c4..43f2f8026b4a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -899,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -1058,7 +1031,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1074,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 6cea17a1ea30..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Read-Copy Update tracing for hierarchical implementation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2008 - * Author: Paul E. McKenney - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RCU_TREE_NONCORE -#include "tree.h" -#include "rcu.h" - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d bseq: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->barrier_sequence); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), - rdp->core_needs_qs); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - rcu_dynticks_snap(rdp->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); - ql += rcu_segcblist_n_cbs(&rdp->cblist); - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], - ".R"[!rcu_segcblist_segempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)], - ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], - ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - int cpu; - struct rcu_state *rsp = (struct rcu_state *)m->private; - struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->exp_workdone0); - s1 += atomic_long_read(&rdp->exp_workdone1); - s2 += atomic_long_read(&rdp->exp_workdone2); - s3 += atomic_long_read(&rdp->exp_workdone3); - } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, - atomic_read(&rsp->expedited_need_qs), - rsp->expedited_sequence / 2); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->gp_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), - rsp->orphan_done.len_lazy, - rsp->orphan_done.len); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - completed = READ_ONCE(rsp->completed); - gpnum = READ_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_core_needs_qs, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_nocb_defer_wakeup, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutree_trace_init); -- cgit v1.2.3 From 44c65ff2e3b0b48250a970183ab53b0602c25764 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 16:26:34 -0700 Subject: rcu: Eliminate NOCBs CPU-state Kconfig options The CONFIG_RCU_NOCB_CPU_ALL, CONFIG_RCU_NOCB_CPU_NONE, and CONFIG_RCU_NOCB_CPU_ZERO Kconfig options are used only in testing and are redundant with the rcu_nocbs= boot parameter. This commit therefore removes these three Kconfig options and adjusts the rcutorture scripts to use the boot parameter instead. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 4 +--- kernel/rcu/tree_plugin.h | 27 +++------------------------ 2 files changed, 4 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index d06c42deee0b..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -564,9 +564,7 @@ void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ -#if defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline bool rcu_is_nocb_cpu(int cpu) { return true; } -#elif defined(CONFIG_RCU_NOCB_CPU) +#ifdef CONFIG_RCU_NOCB_CPU bool rcu_is_nocb_cpu(int cpu); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 43f2f8026b4a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1296,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1409,10 +1408,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) unsigned long dj; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1462,8 +1457,7 @@ static void rcu_prepare_for_idle(void) int tne; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1518,8 +1512,7 @@ static void rcu_prepare_for_idle(void) static void rcu_cleanup_after_idle(void) { RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1786,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1794,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -2253,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2272,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -- cgit v1.2.3 From 0af92d46098a092aa5817dfeb6d24a8d85b66205 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 08:43:40 -0700 Subject: rcu: Move RCU non-debug Kconfig options to kernel/rcu RCU's Kconfig options are scattered, and there are enough of them that it would be good for them to be more centralized. This commit therefore extracts RCU's Kconfig options from init/Kconfig into a new kernel/rcu/Kconfig file. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 242 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 kernel/rcu/Kconfig (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..8edff43e8e94 --- /dev/null +++ b/kernel/rcu/Kconfig @@ -0,0 +1,242 @@ +# +# RCU-related configuration options +# + +menu "RCU Subsystem" + +config TREE_RCU + bool + default y if !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config PREEMPT_RCU + bool + default y if PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool + default y if !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. + +config RCU_EXPERT + bool "Make expert-level adjustments to RCU configuration" + default n + help + This option needs to be enabled if you wish to make + expert-level adjustments to RCU configuration. By default, + no such adjustments can be made, which has the often-beneficial + side-effect of preventing "make oldconfig" from asking you all + sorts of detailed questions about how you would like numerous + obscure RCU options to be set up. + + Say Y if you need to make expert-level adjustments to RCU. + + Say N if you are unsure. + +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + +config TINY_SRCU + bool + default y if SRCU && TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if SRCU && !TINY_RCU + help + This option selects the full-fledged version of SRCU. + +config TASKS_RCU + bool + default n + select SRCU + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config RCU_NEED_SEGCBLIST + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. + + Take the default if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP && RCU_EXPERT + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +endmenu # "RCU Subsystem" -- cgit v1.2.3 From 43a0a2a7d725f2ed2547cd656749eb66c093f2c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 09:19:44 -0700 Subject: rcu: Move RCU debug Kconfig options to kernel/rcu RCU's debugging Kconfig options are in the unintuitive location lib/Kconfig.debug, and there are enough of them that it would be good for them to be more centralized. This commit therefore extracts RCU's Kconfig options from init/Kconfig into a new kernel/rcu/Kconfig.debug file. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 kernel/rcu/Kconfig.debug (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug @@ -0,0 +1,82 @@ +# +# RCU-related debugging configuration options +# + +menu "RCU Debugging" + +config PROVE_RCU + def_bool PROVE_LOCKING + +config TORTURE_TEST + tristate + default n + +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to be built into + the kernel. + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_STALL_COMMON + range 3 300 + default 21 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on DEBUG_KERNEL + default y if TREE_RCU + select TRACE_CLOCK + help + This option enables additional tracepoints for ftrace-style + event tracing. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_EQS_DEBUG + bool "Provide debugging asserts for adding NO_HZ support to an arch" + depends on DEBUG_KERNEL + help + This option provides consistency checks in RCU's handling of + NO_HZ. These checks have proven quite helpful in detecting + bugs in arch-specific NO_HZ code. + + Say N here if you need ultimate kernel/user switch latencies + Say Y if you are unsure + +endmenu # "RCU Debugging" -- cgit v1.2.3 From c23484f0e7bc89e1facb04103ce24efeebee76b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 10:17:03 -0700 Subject: rcu: Remove event tracing from Tiny RCU This commit saves a few lines by getting rid of Tiny RCU's event tracing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 2306cab2195d..595cb1bf944f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "rcu.h" @@ -139,7 +138,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -161,10 +159,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++;) } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -- cgit v1.2.3 From 6d48152eafde1f0d0a4a9e0584fa7d9ff4fbfdac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 10:54:29 -0700 Subject: rcu: Remove RCU CPU stall warnings from Tiny RCU Tiny RCU's job is to be tiny, so this commit removes its RCU CPU stall warning code. After this, there is no longer any need for rcu_sched_ctrlblk and rcu_bh_ctrlblk to be in tiny_plugin.h, so this commit also moves them to tiny.c. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 2 +- kernel/rcu/tiny.c | 35 +++++++++++----------- kernel/rcu/tiny_plugin.h | 78 ------------------------------------------------ 3 files changed, 19 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 8edff43e8e94..be90c945063f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -78,7 +78,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) + def_bool ( TREE_RCU || PREEMPT_RCU ) help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 595cb1bf944f..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -38,11 +38,23 @@ #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ -struct rcu_ctrlblk; -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp); +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; #include "tiny_plugin.h" @@ -65,7 +77,6 @@ EXPORT_SYMBOL(rcu_barrier_sched); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -111,7 +122,6 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -126,10 +136,8 @@ void rcu_check_callbacks(int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -147,18 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(rn, list); + __rcu_reclaim("", list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) @@ -202,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -235,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) - rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c642f23f1582..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -22,34 +22,6 @@ * Author: Paul E. McKenney */ -#include -#include - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include @@ -73,53 +45,3 @@ void __init rcu_scheduler_starting(void) } #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = READ_ONCE(rcp->jiffies_stall); - if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - } else if (ULONG_CMP_GE(j, js)) { - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); - } -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ -- cgit v1.2.3 From 67e707bd68269aac70904943f07a979eeb163b13 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 8 Jun 2017 18:09:09 +0530 Subject: config: android-recommended: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Reviewed-at: https://android-review.googlesource.com/#/c/238388/ Signed-off-by: Jeff Vander Stoep [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 28ee064b6744..a86faa41bfd2 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -11,6 +11,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y -- cgit v1.2.3 From 0c9238c7a1cfd834d8bb96a2b1fabe0b1a5961df Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:10 +0530 Subject: config: android-recommended: enable CONFIG_ARM64_SW_TTBR0_PAN Enable PAN emulation using TTBR0_EL1 switching. Reviewed-at: https://android-review.googlesource.com/#/c/325997/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel and updated the commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a86faa41bfd2..a02c447769f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,6 +6,7 @@ # CONFIG_NF_CONNTRACK_SIP is not set # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set +CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y -- cgit v1.2.3 From c1ebc2febdb85a73a4f91a9b9eaab6387619eaa6 Mon Sep 17 00:00:00 2001 From: Max Shi Date: Thu, 8 Jun 2017 18:09:11 +0530 Subject: config: android-base: disable CONFIG_USELIB and CONFIG_FHANDLE Turn off the two kernel configs to disable related system ABI. Reviewed-at: https://android-review.googlesource.com/#/c/264976/ Signed-off-by: Max Shi [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 26a06e09a5bd..efe5ff86767e 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,10 +1,12 @@ # KEEP ALPHABETICALLY SORTED # CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set # CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -- cgit v1.2.3 From fb0b1538983c1cf7d2a2242b332a34a953753624 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:12 +0530 Subject: config: android-recommended: enable CONFIG_CPU_SW_DOMAIN_PAN Enable CPU domain PAN to ensure that normal kernel accesses are unable to access userspace addresses. Reviewed-at: https://android-review.googlesource.com/#/c/334035/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel, updated the commit message and re-placed the CONFIG_STRICT_KERNEL_RWX config in sorted order] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a02c447769f7..946fb92418f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -14,7 +14,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y -CONFIG_STRICT_KERNEL_RWX=y +CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y @@ -107,6 +107,7 @@ CONFIG_SCHEDSTATS=y CONFIG_SMARTJOYPLUS_FF=y CONFIG_SND=y CONFIG_SOUND=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -- cgit v1.2.3 From 5b89db2fa545b473dc352689ac3afe407367ea34 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:13 +0530 Subject: config: android-base: add CONFIG_IKCONFIG option This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a requirement for the O release. Reviewed-at: https://android-review.googlesource.com/#/c/364553/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index efe5ff86767e..e12cfec25758 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -25,6 +25,8 @@ CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y -- cgit v1.2.3 From 2096e1706336d83cd66ca744e4d904af4d63e25c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:14 +0530 Subject: config: android-base: add CONFIG_MODULES option This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS which are required by the O release. Reviewed-at: https://android-review.googlesource.com/#/c/364554/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index e12cfec25758..62cb392fc34b 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,7 +3,6 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set -# CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set @@ -64,6 +63,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y -- cgit v1.2.3 From 2edfe6be206adc4c1055e053322d27267f8952bc Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 8 Jun 2017 18:09:15 +0530 Subject: config: android-base: add CGROUP_BPF Add CONFIG_CGROUP_BPF as a default configuration in android base config since it is used to replace XT_QTAGUID in future. Reviewed-at: https://android-review.googlesource.com/#/c/400374/ Signed-off-by: Chenbo Feng [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 62cb392fc34b..cdde5af6b332 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -14,6 +14,7 @@ CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y +CONFIG_CGROUP_BPF=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y -- cgit v1.2.3 From 9e69dd0179c346dfb5d08b8d46d5f5c9c81ab1b7 Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Thu, 8 Jun 2017 18:09:16 +0530 Subject: config: android-base: disable CONFIG_NFSD and CONFIG_NFS_FS Disable Network file system support. Reviewed-at: https://android-review.googlesource.com/#/c/409559/ Signed-off-by: Roberto Pereira [AmitP: cherry-picked this change from Android common kernel and updated commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index cdde5af6b332..d70829033bb7 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -- cgit v1.2.3 From 4e4cbee93d56137ebff722be022cae5f70ef84fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:06 +0200 Subject: block: switch bios to blk_status_t Replace bi_error with a new bi_status to allow for a clear conversion. Note that device mapper overloaded bi_error with a private value, which we'll have to keep arround at least for now and thus propagate to a proper blk_status_t value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/power/swap.c | 14 +++++++------- kernel/trace/blktrace.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33639e0..57d22571f306 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; - int error; + blk_status_t error; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); - hb->error = 0; + hb->error = BLK_STS_OK; } static void hib_end_io(struct bio *bio) @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); - if (bio->bi_error && !hb->error) - hb->error = bio->bi_error; + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static int hib_wait_io(struct hib_bio_batch *hb) +static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 193c5f5e3f79..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } -- cgit v1.2.3 From 1e1fc133483ef3b56c20bf3cd9241146c41042f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 May 2017 00:29:38 -0400 Subject: compat_{get,put}_bitmap(): use unsafe_{get,put}_user() unroll the inner loops, while we are at it Signed-off-by: Al Viro --- kernel/compat.c | 81 ++++++++++++++++++++------------------------------------- 1 file changed, 28 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 860f674fa556..9c2a8f3788d5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -871,84 +871,59 @@ int get_compat_sigevent(struct sigevent *event, long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = 0; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - /* - * We dont want to read past the end of the userspace - * bitmap. We must however ensure the end of the - * kernel bitmap is zeroed. - */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__get_user(um, umask)) - return -EFAULT; - } else { - um = 0; - } - - umask++; - m |= (long)um << (j * BITS_PER_COMPAT_LONG); - } - *mask++ = m; + user_access_begin(); + while (nr_compat_longs > 1) { + compat_ulong_t l1, l2; + unsafe_get_user(l1, umask++, Efault); + unsafe_get_user(l2, umask++, Efault); + *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_get_user(*mask, umask++, Efault); + user_access_end(); return 0; + +Efault: + user_access_end(); + return -EFAULT; } long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = *mask++; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - um = m; - - /* - * We dont want to write past the end of the userspace - * bitmap. - */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__put_user(um, umask)) - return -EFAULT; - } - - umask++; - m >>= 4*sizeof(um); - m >>= 4*sizeof(um); - } + user_access_begin(); + while (nr_compat_longs > 1) { + unsigned long m = *mask++; + unsafe_put_user((compat_ulong_t)m, umask++, Efault); + unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); + user_access_end(); return 0; +Efault: + user_access_end(); + return -EFAULT; } void -- cgit v1.2.3 From ca2406ed58fef3f7c8ef6470cba807bfc3415605 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:22:44 -0400 Subject: times(2): move compat to native Signed-off-by: Al Viro --- kernel/compat.c | 24 ------------------------ kernel/sys.c | 28 +++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 9c2a8f3788d5..99408252762e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,30 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -static compat_clock_t clock_t_to_compat_clock_t(clock_t x) -{ - return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); -} - -COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) -{ - if (tbuf) { - struct tms tms; - struct compat_tms tmp; - - do_sys_times(&tms); - /* Convert our struct tms to the compat version. */ - tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); - tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); - tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); - tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); - if (copy_to_user(tbuf, &tmp, sizeof(tmp))) - return -EFAULT; - } - force_successful_syscall_return(); - return compat_jiffies_to_clock_t(jiffies); -} - #ifdef __ARCH_WANT_SYS_SIGPENDING /* diff --git a/kernel/sys.c b/kernel/sys.c index 3778a8a417b6..161b5eae9c77 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -void do_sys_times(struct tms *tms) +static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; @@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) return (long) jiffies_64_to_clock_t(get_jiffies_64()); } +#ifdef CONFIG_COMPAT +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) +{ + if (tbuf) { + struct tms tms; + struct compat_tms tmp; + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + force_successful_syscall_return(); + return compat_jiffies_to_clock_t(jiffies); +} +#endif + /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully -- cgit v1.2.3 From d9e968cb9f849770288f5fde3d8d3a5f7e339052 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:33:51 -0400 Subject: getrlimit()/setrlimit(): move compat to native Signed-off-by: Al Viro --- kernel/compat.c | 38 -------------------------------------- kernel/sys.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 99408252762e..58b8e57398d1 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -427,44 +427,6 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - - if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || - __get_user(r.rlim_cur, &rlim->rlim_cur) || - __get_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - - if (r.rlim_cur == COMPAT_RLIM_INFINITY) - r.rlim_cur = RLIM_INFINITY; - if (r.rlim_max == COMPAT_RLIM_INFINITY) - r.rlim_max = RLIM_INFINITY; - return do_prlimit(current, resource, &r, NULL); -} - -COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - - ret = do_prlimit(current, resource, NULL, &r); - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || diff --git a/kernel/sys.c b/kernel/sys.c index 161b5eae9c77..873e6eaa314f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1332,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) return ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + struct compat_rlimit r32; + + if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) + return -EFAULT; + + if (r32.rlim_cur == COMPAT_RLIM_INFINITY) + r.rlim_cur = RLIM_INFINITY; + else + r.rlim_cur = r32.rlim_cur; + if (r32.rlim_max == COMPAT_RLIM_INFINITY) + r.rlim_max = RLIM_INFINITY; + else + r.rlim_max = r32.rlim_max; + return do_prlimit(current, resource, &r, NULL); +} + +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + int ret; + + ret = do_prlimit(current, resource, NULL, &r); + if (!ret) { + struct rlimit r32; + if (r.rlim_cur > COMPAT_RLIM_INFINITY) + r32.rlim_cur = COMPAT_RLIM_INFINITY; + else + r32.rlim_cur = r.rlim_cur; + if (r.rlim_max > COMPAT_RLIM_INFINITY) + r32.rlim_max = COMPAT_RLIM_INFINITY; + else + r32.rlim_max = r.rlim_max; + + if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) + return -EFAULT; + } + return ret; +} + +#endif + #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* -- cgit v1.2.3 From 8f13621abcedb278cfecf9703583743f9c474c97 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:42:07 -0400 Subject: sigpending(): move compat to native ... and kill set_fs() use Signed-off-by: Al Viro --- kernel/compat.c | 23 ----------------------- kernel/signal.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 58b8e57398d1..195e23469854 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,29 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -#ifdef __ARCH_WANT_SYS_SIGPENDING - -/* - * Assumption: old_sigset_t and compat_old_sigset_t are both - * types that can be passed to put_user()/get_user(). - */ - -COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sigpending((old_sigset_t __user *) &s); - set_fs(old_fs); - if (ret == 0) - ret = put_user(s, set); - return ret; -} - -#endif - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/signal.c b/kernel/signal.c index d1eed0d7ca64..6237f492adfc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3254,6 +3254,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) +{ + sigset_t set; + int err = do_sigpending(&set, sizeof(old_sigset_t)); + if (err == 0) + if (copy_to_user(set32, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; +} +#endif + #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -- cgit v1.2.3 From 7668b679c3b7931f6436d1eef50904831209e749 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 06:39:31 -0400 Subject: put_compat_rusage(): switch to copy_to_user() Signed-off-by: Al Viro --- kernel/compat.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 195e23469854..64e772aabdb5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -406,25 +406,27 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { - if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || - __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || - __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || - __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || - __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || - __put_user(r->ru_maxrss, &ru->ru_maxrss) || - __put_user(r->ru_ixrss, &ru->ru_ixrss) || - __put_user(r->ru_idrss, &ru->ru_idrss) || - __put_user(r->ru_isrss, &ru->ru_isrss) || - __put_user(r->ru_minflt, &ru->ru_minflt) || - __put_user(r->ru_majflt, &ru->ru_majflt) || - __put_user(r->ru_nswap, &ru->ru_nswap) || - __put_user(r->ru_inblock, &ru->ru_inblock) || - __put_user(r->ru_oublock, &ru->ru_oublock) || - __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || - __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || - __put_user(r->ru_nsignals, &ru->ru_nsignals) || - __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || - __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) + struct compat_rusage r32; + memset(&r32, 0, sizeof(r32)); + r32.ru_utime.tv_sec = r->ru_utime.tv_sec; + r32.ru_utime.tv_usec = r->ru_utime.tv_usec; + r32.ru_stime.tv_sec = r->ru_stime.tv_sec; + r32.ru_stime.tv_usec = r->ru_stime.tv_usec; + r32.ru_maxrss = r->ru_maxrss; + r32.ru_ixrss = r->ru_ixrss; + r32.ru_idrss = r->ru_idrss; + r32.ru_isrss = r->ru_isrss; + r32.ru_minflt = r->ru_minflt; + r32.ru_majflt = r->ru_majflt; + r32.ru_nswap = r->ru_nswap; + r32.ru_inblock = r->ru_inblock; + r32.ru_oublock = r->ru_oublock; + r32.ru_msgsnd = r->ru_msgsnd; + r32.ru_msgrcv = r->ru_msgrcv; + r32.ru_nsignals = r->ru_nsignals; + r32.ru_nvcsw = r->ru_nvcsw; + r32.ru_nivcsw = r->ru_nivcsw; + if (copy_to_user(ru, &r32, sizeof(r32))) return -EFAULT; return 0; } -- cgit v1.2.3 From 1b3c872c8342803d0fcd8042e4e007d173191db6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:46:17 -0400 Subject: rt_sigtimedwait(): move compat to native Signed-off-by: Al Viro --- kernel/compat.c | 32 -------------------------------- kernel/signal.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 64e772aabdb5..36e6e7c405e3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -866,38 +866,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) } } -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, - struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) -{ - compat_sigset_t s32; - sigset_t s; - struct timespec t; - siginfo_t info; - long ret; - - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&s, &s32); - - if (uts) { - if (compat_get_timespec(&t, uts)) - return -EFAULT; - } - - ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - - if (ret > 0 && uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 6237f492adfc..fe5b3608c31c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2768,7 +2768,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ -int do_sigtimedwait(const sigset_t *which, siginfo_t *info, +static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, const struct timespec *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; @@ -2857,6 +2857,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + struct timespec t; + siginfo_t info; + long ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + + if (uts) { + if (compat_get_timespec(&t, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} +#endif + /** * sys_kill - send a signal to a process * @pid: the PID of the process -- cgit v1.2.3 From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:40 +0200 Subject: bpf: avoid excessive stack usage for perf_sample_data perf_sample_data consumes 386 bytes on stack, reduce excessive stack usage and move it to per cpu buffer. It's allowed due to preemption being disabled for tracing, xdp and tc programs, thus at all times only one program can run on a specific CPU and programs cannot run from interrupt. We similarly also handle bpf_pt_regs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08eb072430b9..051d7fca0c09 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw) { struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; @@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = raw; - perf_event_output(event, &sample_data, regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = raw; + perf_event_output(event, sd, regs); return 0; } -- cgit v1.2.3 From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:41 +0200 Subject: bpf: don't check spilled reg state for non-STACK_SPILLed type slots spilled_regs[] state is only used for stack slots of type STACK_SPILL, never for STACK_MISC. Right now, in states_equal(), even if we have old and current stack state of type STACK_MISC, we compare spilled_regs[] for that particular offset. Just skip these like we do everywhere else. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..d031b3b0752e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (i % BPF_REG_SIZE) continue; + if (old->stack_slot_type[i] != STACK_SPILL) + continue; if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], sizeof(old->spilled_regs[0]))) -- cgit v1.2.3 From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:42 +0200 Subject: bpf: reset id on CONST_IMM transition Whenever we set the register to the type CONST_IMM, we currently don't reset the id to 0. id member is not used in CONST_IMM case, so don't let it become stale, where pruning won't be able to match later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d031b3b0752e..d195d825515a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].id = 0; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); @@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].id = 0; return 0; } -- cgit v1.2.3 From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:43 +0200 Subject: bpf: reset id on spilled regs in clear_all_pkt_pointers Right now, we don't reset the id of spilled registers in case of clear_all_pkt_pointers(). Given pkt_pointers are highly likely to contain an id, do so by reusing __mark_reg_unknown_value(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d195d825515a..519a6144d3d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - reg->type = UNKNOWN_VALUE; - reg->imm = 0; + __mark_reg_unknown_value(state->spilled_regs, + i / BPF_REG_SIZE); } } -- cgit v1.2.3 From f67abed585efe251edda52dc9690020d6441890f Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Jun 2017 10:00:29 +0200 Subject: sched/fair: Fix typo in printk message 'schedstats' kernel parameter should be set to enable/disable, so correct the printk hint saying that it should be set to 'enable' rather than 'enabled' to enable scheduler tracepoints. Signed-off-by: Marcin Nowakowski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496995229-31245-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..c77e4b1d51c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3563,7 +3563,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_runtime_enabled()) { printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enabled or " + "kernel parameter schedstats=enable or " "kernel.sched_schedstats=1\n"); } #endif -- cgit v1.2.3 From 252d2a4117bc181b287eeddf848863788da733ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Jun 2017 11:49:15 -0700 Subject: sched/core: Idle_task_exit() shouldn't use switch_mm_irqs_off() idle_task_exit() can be called with IRQs on x86 on and therefore should use switch_mm(), not switch_mm_irqs_off(). This doesn't seem to cause any problems right now, but it will confuse my upcoming TLB flush changes. Nonetheless, I think it should be backported because it's trivial. There won't be any meaningful performance impact because idle_task_exit() is only used when offlining a CPU. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler") Link: http://lkml.kernel.org/r/ca3d1a9fa93a0b49f5a8ff729eda3640fb6abdf9.1497034141.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..326d4f88e2b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5605,7 +5605,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm_irqs_off(mm, &init_mm, current); + switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); -- cgit v1.2.3 From ff0a6d6f932ff4b9eb8e8140f98cc1cf763d0d78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Jun 2017 14:16:16 +0200 Subject: Revert "cpufreq: schedutil: Reduce frequencies slower" Revert commit 39b64aa1c007 (cpufreq: schedutil: Reduce frequencies slower) that introduced unintentional changes in behavior leading to adverse effects on some systems. Reported-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..8773d1efdfab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (sg_policy->next_freq == next_freq) return; - if (sg_policy->next_freq > next_freq) - next_freq = (sg_policy->next_freq + next_freq) >> 1; - sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; -- cgit v1.2.3 From c6503be587e9c5c0aac4e2b45de982352f676a5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 17:21:26 +0200 Subject: posix-timers: Fix inverted SIGEV_NONE logic in common_timer_get() The refactoring of the posix-timer core to allow better code sharing introduced inverted logic vs. SIGEV_NONE timers in common_timer_get(). That causes hrtimer_forward() to be called on active timers, which rightfully triggers the warning hrtimer_forward(). Make sig_none what it says: signal mode == SIGEV_NONE. Fixes: 91d57bae0868 ("posix-timers: Make use of forward/remaining callbacks") Reported-by: Ye Xiaolong Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170609104457.GA39907@inn.lkp.intel.com --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 6e7a70b1bf37..b53a0b562516 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -644,7 +644,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; + sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ -- cgit v1.2.3 From 94114c367553f3301747e47f6947cabde947575f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 7 Jun 2017 23:36:03 -0700 Subject: tick/broadcast: Make tick_broadcast_setup_oneshot() static This function isn't used outside of tick-broadcast.c, so let's mark it static. Signed-off-by: Stephen Boyd Link: http://lkml.kernel.org/r/20170608063603.13276-1-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 +++- kernel/time/tick-internal.h | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 987e496bb51a..b398c2ea69b2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -37,9 +37,11 @@ static int tick_broadcast_forced; static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); #ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif @@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, /** * tick_broadcast_setup_oneshot - setup the broadcast device */ -void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f738251000fe..be0ac01f2e12 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) -extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); @@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -- cgit v1.2.3 From 57de72125d34f83bfd39615fcc3cc25ca3b9c0ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 8 Jun 2017 10:55:33 +0200 Subject: cpu/hotplug: Remove unused check_for_tasks() function clang -Wunused-function found one remaining function that was apparently meant to be removed in a recent code cleanup: kernel/cpu.c:565:20: warning: unused function 'check_for_tasks' [-Wunused-function] Sebastian explained: The function became unused unintentionally, but there is already a failure check, when a task cannot be removed from the outgoing cpu in the scheduler code, so bringing it back is not really giving any extra value. Fixes: 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions") Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20170608085544.2257132-1-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 7435ffc6163b..d0f5f54aa087 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -562,30 +562,6 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int dead_cpu) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (!p->on_rq) - continue; - /* - * We do the check with unlocked task_rq(p)->lock. - * Order the reading to do not warn about a task, - * which was running on this cpu in the past, and - * it's just been woken on another cpu. - */ - rmb(); - if (task_cpu(p) != dead_cpu) - continue; - - pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", - p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } - read_unlock(&tasklist_lock); -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { -- cgit v1.2.3 From 5c7a3a3d20a4e175304c0e23809e3d70be8fed8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 19:44:09 +0200 Subject: posix-timers: Zero out oldval itimerspec The recent posix timer rework moved the clearing of the itimerspec to the real syscall implementation, but forgot that the kclock->timer_get() is used by timer_settime() as well. That results in an uninitialized variable and bogus values returned to user space. Add the missing memset to timer_settime(). Fixes: eabdec043853 ("posix-timers: Zero settings value in common code") Reported-by: Andrei Vagin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com --- kernel/time/posix-timers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b53a0b562516..88517dcfe0ca 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -828,6 +828,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!timespec64_valid(&new_spec64.it_interval) || !timespec64_valid(&new_spec64.it_value)) return -EINVAL; + if (rtn) + memset(rtn, 0, sizeof(*rtn)); retry: timr = lock_timer(timer_id, &flag); if (!timr) -- cgit v1.2.3 From 67edab48caeb75d412706f4b9d3107afd1e07623 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 19:39:49 +0200 Subject: posix-timers: Handle relative posix-timers correctly The recent rework of the posix timer internals broke the magic posix mechanism, which requires that relative timers are not affected by modifications of the underlying clock. That means relative CLOCK_REALTIME timers cannot use CLOCK_REALTIME, because that can be set and adjusted. The underlying hrtimer switches the clock for these timers to CLOCK_MONOTONIC. That still works, but reading the remaining time of such a timer has been broken in the rework. The old code used the hrtimer internals directly and avoided the posix clock callbacks. Now common_timer_get() uses the underlying kclock->timer_get() callback, which is still CLOCK_REALTIME based. So the remaining time of such a timer is calculated against the wrong time base. Handle it by switching the k_itimer->kclock pointer according to the resulting hrtimer mode. k_itimer->it_clock still contains CLOCK_REALTIME because the timer might be set with ABSTIME later and then it needs to switch back to the realtime posix clock implementation. Fixes: eae1c4ae275f ("posix-timers: Make use of cancel/arm callbacks") Reported-by: Andrei Vagin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com --- kernel/time/posix-timers.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 88517dcfe0ca..58c0f60b132f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -72,6 +72,7 @@ static DEFINE_SPINLOCK(hash_lock); static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); +static const struct k_clock clock_realtime, clock_monotonic; /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -750,6 +751,18 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, enum hrtimer_mode mode; mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + /* + * Posix magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they become CLOCK_MONOTONIC based under the + * hood. See hrtimer_init(). Update timr->kclock, so the generic + * functions which use timr->kclock->clock_get() work. + * + * Note: it_clock stays unmodified, because the next timer_set() might + * use ABSTIME, so it needs to switch back. + */ + if (timr->it_clock == CLOCK_REALTIME) + timr->kclock = absolute ? &clock_realtime : &clock_monotonic; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; -- cgit v1.2.3 From e4c1a0d15b62a3ed2f8cd560e9fa589dbe880c13 Mon Sep 17 00:00:00 2001 From: Derek Robson Date: Mon, 12 Jun 2017 14:33:03 +1200 Subject: audit: style fix Fixed checkpatch.pl warnings of "function definition argument FOO should also have an identifier name" Signed-off-by: Derek Robson Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index bb3a4e14b7e5..b331d9b83f63 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -248,13 +248,13 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *); +int audit_send_list(void *_dest); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; -extern int audit_del_rule(struct audit_entry *); -extern void audit_free_rule_rcu(struct rcu_head *); +extern int audit_del_rule(struct audit_entry *entry); +extern void audit_free_rule_rcu(struct rcu_head *head); extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); @@ -302,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE -extern struct audit_chunk *audit_tree_lookup(const struct inode *); -extern void audit_put_chunk(struct audit_chunk *); -extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); -extern int audit_make_tree(struct audit_krule *, char *, u32); -extern int audit_add_tree_rule(struct audit_krule *); -extern int audit_remove_tree_rule(struct audit_krule *); +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *); -extern void audit_put_tree(struct audit_tree *); -extern void audit_kill_trees(struct list_head *); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct list_head *list); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -324,7 +324,7 @@ extern void audit_kill_trees(struct list_head *); #define audit_kill_trees(list) BUG() #endif -extern char *audit_unpack_string(void **, size_t *, size_t); +extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; @@ -334,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype); #ifdef CONFIG_AUDITSYSCALL extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED -- cgit v1.2.3 From fa07ab72cbb0d843429e61bf179308aed6cbe0dd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 11 Jun 2017 00:38:36 +0200 Subject: genirq: Release resources in __setup_irq() error path In case __irq_set_trigger() fails the resources requested via irq_request_resources() are not released. Add the missing release call into the error handling path. Fixes: c1bacbae8192 ("genirq: Provide irq_request/release_resources chip callbacks") Signed-off-by: Heiner Kallweit Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/655538f5-cb20-a892-ff15-fbd2dd1fa4ec@gmail.com --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..425170d4439b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1312,8 +1312,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) + if (ret) { + irq_release_resources(desc); goto out_mask; + } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ -- cgit v1.2.3 From d4af6d933ccffd24286528f04d5c39e702c8580f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Jun 2017 06:04:14 +0200 Subject: nohz: Fix spurious warning when hrtimer and clockevent get out of sync The sanity check ensuring that the tick expiry cache (ts->next_tick) is actually in sync with the hardware clock (dev->next_event) makes the wrong assumption that the clock can't be programmed later than the hrtimer deadline. In fact the clock hardware can be programmed later on some conditions such as: * The hrtimer deadline is already in the past. * The hrtimer deadline is earlier than the minimum delay supported by the hardware. Such conditions can be met when we program the tick, for example if the last jiffies update hasn't been seen by the current CPU yet, we may program the hrtimer to a deadline that is earlier than ktime_get() because last_jiffies_update is our timestamp base to compute the next tick. As a result, we can randomly observe such warning: WARNING: CPU: 5 PID: 0 at kernel/time/tick-sched.c:794 tick_nohz_stop_sched_tick kernel/time/tick-sched.c:791 [inline] Call Trace: tick_nohz_irq_exit tick_irq_exit irq_exit exiting_irq smp_call_function_interrupt smp_call_function_single_interrupt call_function_single_interrupt Therefore, let's rather make sure that the tick expiry cache is sync'ed with the tick hrtimer deadline, against which it is not supposed to drift away. The clock hardware instead has its own will and can't be used as a reliable comparison point. Reported-and-tested-by: Sasha Levin Reported-and-tested-by: Abdul Haleem Signed-off-by: Frederic Weisbecker Cc: James Hartsock Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tim Wright Link: http://lkml.kernel.org/r/1497326654-14122-1-git-send-email-fweisbec@gmail.com [ Minor readability edit. ] Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9d31f1e0067b..204600986e0d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -768,7 +768,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (likely(dev->next_event <= ts->next_tick)) + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) goto out; WARN_ON_ONCE(1); @@ -806,8 +806,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, goto out; } + hrtimer_set_expires(&ts->sched_timer, tick); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); out: -- cgit v1.2.3 From 6a5ae63a0cc5d3ac73a96cb412fde66f3a71f98e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 8 Jun 2017 19:53:24 -0700 Subject: tracing: Remove unused declaration of trace_stop_cmdline_recording trace_stop_cmdline_recording declaration isn't in use, remove it. Link: http://lkml.kernel.org/r/20170609025327.9508-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..63deff9cdf2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1910,8 +1910,6 @@ static void tracing_stop_tr(struct trace_array *tr) raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -void trace_stop_cmdline_recording(void); - static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; -- cgit v1.2.3 From f4e981cba2dec675d40ac4f270b7e8ac164c9004 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Wed, 24 May 2017 07:49:50 +0200 Subject: printk: add __printf attributes to internal functions When compiling with -Wsuggest-attribute=format, gcc complains that some functions in kernel/printk/printk_safe.c transmit their argument to printf-like functions without having a printf attribute. Silence these warnings by adding relevant __printf attributes. Link: http://lkml.kernel.org/r/20170524054950.6722-1-nicolas.iooss_linux@m4x.org Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Nicolas Iooss Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk_safe.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 03a42a539b20..3cdaeaef9ce1 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) * happen, printk_safe_log_store() will notice the buffer->len mismatch * and repeat the write. */ -static int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) +static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { int add; size_t len; @@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void) * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); @@ -330,7 +330,7 @@ void printk_nmi_exit(void) #else -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { return 0; } @@ -342,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args) * into itself. It uses a per-CPU buffer to store the message, just like * NMI. */ -static int vprintk_safe(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); -- cgit v1.2.3 From c81be52a3ac0267aa830a2c4cb769030ea3483c9 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 09:35:24 -0400 Subject: audit: fix a race condition with the auditd tracking code Originally reported by Adam and Dusty, it appears we have a small race window in kauditd_thread(), as documented in the Fedora BZ: * https://bugzilla.redhat.com/show_bug.cgi?id=1459326#c35 "This issue is partly due to the read-copy nature of RCU, and partly due to how we sync the auditd_connection state across kauditd_thread and the audit control channel. The kauditd_thread thread is always running so it can service the record queues and emit the multicast messages, if it happens to be just past the "main_queue" label, but before the "if (sk == NULL || ...)" if-statement which calls auditd_reset() when the new auditd connection is registered it could end up resetting the auditd connection, regardless of if it is valid or not. This is a rather small window and the variable nature of multi-core scheduling explains why this is proving rather difficult to reproduce." The fix is to have functions only call auditd_reset() when they believe that the kernel/auditd connection is still valid, e.g. non-NULL, and to have these callers pass their local copy of the auditd_connection pointer to auditd_reset() where it can be compared with the current connection state before resetting. If the caller has a stale state tracking pointer then the reset is ignored. We also make a small change to kauditd_thread() so that if the kernel/auditd connection is dead we skip the retry queue and send the records straight to the hold queue. This is necessary as we used to rely on auditd_reset() to occasionally purge the retry queue but we are going to be calling the reset function much less now and we want to make sure the retry queue doesn't grow unbounded. Reported-by: Adam Williamson Reported-by: Dusty Mabe Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b2e877100242..e1e2b3abfb93 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb) /** * auditd_reset - Disconnect the auditd connection + * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the - * hold queue in case auditd reconnects. + * hold queue in case auditd reconnects. It is important to note that the @ac + * pointer should never be dereferenced inside this function as it may be NULL + * or invalid, you can only compare the memory address! If @ac is NULL then + * the connection will always be reset. */ -static void auditd_reset(void) +static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; @@ -590,6 +594,11 @@ static void auditd_reset(void) spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); + if (ac && ac != ac_old) { + /* someone already registered a new auditd connection */ + spin_unlock_irqrestore(&auditd_conn_lock, flags); + return; + } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); @@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) return rc; err: - if (rc == -ECONNREFUSED) - auditd_reset(); + if (ac && rc == -ECONNREFUSED) + auditd_reset(ac); return rc; } @@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -815,12 +824,13 @@ main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the - * multicast send and move the record to the retry queue */ + * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, - kauditd_retry_skb); - if (sk == NULL || rc < 0) - auditd_reset(); + (sk ? + kauditd_retry_skb : kauditd_hold_skb)); + if (ac && rc < 0) + auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ @@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) auditd_pid, 1); /* unregister the auditd connection */ - auditd_reset(); + auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { -- cgit v1.2.3 From 02fd7f68f5342bc7e8054cb05ea4a07f26d41d12 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:42 -0500 Subject: trace: rename kernel enum section to eval The kernel and its modules have sections containing the enum string to value conversions. Rename this section because we intend to store more than enums in it. Link: http://lkml.kernel.org/r/20170531215653.3240-2-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 2 +- kernel/trace/trace.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..9ec4713c5eee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,7 +3077,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_enum_map", + mod->trace_enums = section_objs(info, "_ftrace_eval_map", sizeof(*mod->trace_enums), &mod->num_trace_enums); #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63deff9cdf2c..acd3eb4d56a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7732,15 +7732,15 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_enum_maps[]; -extern struct trace_enum_map *__stop_ftrace_enum_maps[]; +extern struct trace_enum_map *__start_ftrace_eval_maps[]; +extern struct trace_enum_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { int len; - len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; - trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -- cgit v1.2.3 From 00f4b652b6f1dbfd4e1d5419d7f1cc23b1374da8 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:43 -0500 Subject: trace: rename trace_enum_map to trace_eval_map Each enum is loaded into the trace_enum_map, as we are now using this for more than enums rename it. Link: http://lkml.kernel.org/r/20170531215653.3240-3-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 24 ++++++++++++------------ kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events.c | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acd3eb4d56a0..46fac3f63af1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ union trace_enum_map_item; struct trace_enum_map_tail { /* * "end" is first and points to NULL as it must be different - * than "mod" or "enum_string" + * than "mod" or "eval_string" */ union trace_enum_map_item *next; const char *end; /* points to NULL */ @@ -148,7 +148,7 @@ static DEFINE_MUTEX(trace_enum_mutex); * pointer to the next array of saved enum_map items. */ union trace_enum_map_item { - struct trace_enum_map map; + struct trace_eval_map map; struct trace_enum_map_head head; struct trace_enum_map_tail tail; }; @@ -4748,7 +4748,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { static union trace_enum_map_item * update_enum_map(union trace_enum_map_item *ptr) { - if (!ptr->map.enum_string) { + if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ @@ -4808,7 +4808,7 @@ static int enum_map_show(struct seq_file *m, void *v) union trace_enum_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", - ptr->map.enum_string, ptr->map.enum_value, + ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; @@ -4844,11 +4844,11 @@ trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, +trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, int len) { - struct trace_enum_map **stop; - struct trace_enum_map **map; + struct trace_eval_map **stop; + struct trace_eval_map **map; union trace_enum_map_item *map_array; union trace_enum_map_item *ptr; @@ -4902,13 +4902,13 @@ static void trace_create_enum_file(struct dentry *d_tracer) #else /* CONFIG_TRACE_ENUM_MAP_FILE */ static inline void trace_create_enum_file(struct dentry *d_tracer) { } static inline void trace_insert_enum_map_file(struct module *mod, - struct trace_enum_map **start, int len) { } + struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ static void trace_insert_enum_map(struct module *mod, - struct trace_enum_map **start, int len) + struct trace_eval_map **start, int len) { - struct trace_enum_map **map; + struct trace_eval_map **map; if (len <= 0) return; @@ -7732,8 +7732,8 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_eval_maps[]; -extern struct trace_enum_map *__stop_ftrace_eval_maps[]; +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39fd77330aab..a9667297ae49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_enum_map **map, int len); +void trace_event_enum_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7973e10398c..cf5b9aa4d732 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,18 +2067,18 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the enum value as a string */ - elen = snprintf(ptr, 0, "%ld", map->enum_value); + elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; - snprintf(ptr, elen + 1, "%ld", map->enum_value); + snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); @@ -2090,11 +2090,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) } static void update_event_printk(struct trace_event_call *call, - struct trace_enum_map *map) + struct trace_eval_map *map) { char *ptr; int quote = 0; - int len = strlen(map->enum_string); + int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { @@ -2125,7 +2125,7 @@ static void update_event_printk(struct trace_event_call *call, continue; } if (isalpha(*ptr) || *ptr == '_') { - if (strncmp(map->enum_string, ptr, len) == 0 && + if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = enum_replace(ptr, map, len); /* Hmm, enum string smaller than value */ @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_enum_map **map, int len) +void trace_event_enum_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit v1.2.3 From 99be647c5841d570a23b5dfa65bfecada8b6e6b5 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:44 -0500 Subject: trace: rename struct module entry for trace enums Each module has a list of enum's its contributing to the enum map, rename that entry to reflect its use by more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-4-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++--- kernel/trace/trace.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 9ec4713c5eee..df1c4a9e7abb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,9 +3077,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_eval_map", - sizeof(*mod->trace_enums), - &mod->num_trace_enums); + mod->trace_evals = section_objs(info, "_ftrace_eval_map", + sizeof(*mod->trace_evals), + &mod->num_trace_evals); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46fac3f63af1..061abd8ba101 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7746,7 +7746,7 @@ static void __init trace_enum_init(void) #ifdef CONFIG_MODULES static void trace_module_add_enums(struct module *mod) { - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; /* @@ -7756,7 +7756,7 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); + trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE @@ -7765,7 +7765,7 @@ static void trace_module_remove_enums(struct module *mod) union trace_enum_map_item *map; union trace_enum_map_item **last = &trace_enum_maps; - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); -- cgit v1.2.3 From 23bf8cb8dc86c0368d2471ebb4622e7edd38190b Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:45 -0500 Subject: trace: rename trace enum data structures in trace.c The enum map entries can be exported to userspace via a sys enum_map file. Rename those functions and structures to reflect the fact that we are using them for more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-5-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 061abd8ba101..13c81f4f2bd7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -122,38 +122,38 @@ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_ENUM_MAP_FILE /* Map of enums to their values, for "enum_map" file */ -struct trace_enum_map_head { +struct trace_eval_map_head { struct module *mod; unsigned long length; }; -union trace_enum_map_item; +union trace_eval_map_item; -struct trace_enum_map_tail { +struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ - union trace_enum_map_item *next; + union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_enum_mutex); /* - * The trace_enum_maps are saved in an array with two extra elements, + * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved enum_map items. */ -union trace_enum_map_item { +union trace_eval_map_item { struct trace_eval_map map; - struct trace_enum_map_head head; - struct trace_enum_map_tail tail; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; }; -static union trace_enum_map_item *trace_enum_maps; +static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4745,8 +4745,8 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { }; #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static union trace_enum_map_item * -update_enum_map(union trace_enum_map_item *ptr) +static union trace_eval_map_item * +update_enum_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4761,7 +4761,7 @@ update_enum_map(union trace_enum_map_item *ptr) static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. @@ -4782,12 +4782,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) static void *enum_map_start(struct seq_file *m, loff_t *pos) { - union trace_enum_map_item *v; + union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_enum_mutex); - v = trace_enum_maps; + v = trace_eval_maps; if (v) v++; @@ -4805,7 +4805,7 @@ static void enum_map_stop(struct seq_file *m, void *v) static int enum_map_show(struct seq_file *m, void *v) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, @@ -4836,8 +4836,8 @@ static const struct file_operations tracing_enum_map_fops = { .release = seq_release, }; -static inline union trace_enum_map_item * -trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +static inline union trace_eval_map_item * +trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4849,13 +4849,13 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, { struct trace_eval_map **stop; struct trace_eval_map **map; - union trace_enum_map_item *map_array; - union trace_enum_map_item *ptr; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; stop = start + len; /* - * The trace_enum_maps contains the map plus a head and tail item, + * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ @@ -4867,10 +4867,10 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, mutex_lock(&trace_enum_mutex); - if (!trace_enum_maps) - trace_enum_maps = map_array; + if (!trace_eval_maps) + trace_eval_maps = map_array; else { - ptr = trace_enum_maps; + ptr = trace_eval_maps; for (;;) { ptr = trace_enum_jmp_to_tail(ptr); if (!ptr->tail.next) @@ -7762,15 +7762,15 @@ static void trace_module_add_enums(struct module *mod) #ifdef CONFIG_TRACE_ENUM_MAP_FILE static void trace_module_remove_enums(struct module *mod) { - union trace_enum_map_item *map; - union trace_enum_map_item **last = &trace_enum_maps; + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); - map = trace_enum_maps; + map = trace_eval_maps; while (map) { if (map->head.mod == mod) -- cgit v1.2.3 From 1793ed939b2a0e18b06467e10d15b66925d75d5f Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:46 -0500 Subject: trace: rename trace_enum_mutex to trace_eval_mutex There is a lock protecting the trace_enum_map, rename it to reflect the use by more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-6-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 13c81f4f2bd7..e0279f5dc83f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -138,7 +138,7 @@ struct trace_eval_map_tail { const char *end; /* points to NULL */ }; -static DEFINE_MUTEX(trace_enum_mutex); +static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, @@ -4785,7 +4785,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) union trace_eval_map_item *v; loff_t l = 0; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) @@ -4800,7 +4800,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) static void enum_map_stop(struct seq_file *m, void *v) { - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static int enum_map_show(struct seq_file *m, void *v) @@ -4865,7 +4865,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -4890,7 +4890,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, } memset(map_array, 0, sizeof(*map_array)); - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static void trace_create_enum_file(struct dentry *d_tracer) @@ -7768,7 +7768,7 @@ static void trace_module_remove_enums(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); map = trace_eval_maps; @@ -7785,7 +7785,7 @@ static void trace_module_remove_enums(struct module *mod) *last = trace_enum_jmp_to_tail(map)->tail.next; kfree(map); out: - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_enums(struct module *mod) { } -- cgit v1.2.3 From 5f60b351a7e37bf243725bcc131708be2c8ea497 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:47 -0500 Subject: trace: rename trace.c enum functions Rename the init and trace_enum_jmp_to_tail() routines to reflect their use by more than enumerated types. Link: http://lkml.kernel.org/r/20170531215653.3240-7-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e0279f5dc83f..d703f429bbd9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4837,7 +4837,7 @@ static const struct file_operations tracing_enum_map_fops = { }; static inline union trace_eval_map_item * -trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4872,7 +4872,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, else { ptr = trace_eval_maps; for (;;) { - ptr = trace_enum_jmp_to_tail(ptr); + ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; @@ -7735,7 +7735,7 @@ struct dentry *tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_enum_init(void) +static void __init trace_eval_init(void) { int len; @@ -7775,14 +7775,14 @@ static void trace_module_remove_enums(struct module *mod) while (map) { if (map->head.mod == mod) break; - map = trace_enum_jmp_to_tail(map); + map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) goto out; - *last = trace_enum_jmp_to_tail(map)->tail.next; + *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); out: mutex_unlock(&trace_eval_mutex); @@ -7839,7 +7839,7 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); - trace_enum_init(); + trace_eval_init(); trace_create_enum_file(d_tracer); -- cgit v1.2.3 From f57a41434fc51732dd5e35e0e1aa9e607f1a05d6 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:48 -0500 Subject: trace: rename enum_map functions Rename the core trace enum routines to use eval, to reflect their use by more than just enum to value mapping. Link: http://lkml.kernel.org/r/20170531215653.3240-8-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++----------------------- kernel/trace/trace.h | 4 +-- kernel/trace/trace_events.c | 2 +- 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d703f429bbd9..d830be7f0ba6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_map items. + * pointer to the next array of saved enum_eval/enum_map items. */ union trace_eval_map_item { struct trace_eval_map map; @@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. - * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list - * of strings in the order that the enums were defined. + * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b @@ -4746,7 +4746,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { #ifdef CONFIG_TRACE_ENUM_MAP_FILE static union trace_eval_map_item * -update_enum_map(union trace_eval_map_item *ptr) +update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4759,7 +4759,7 @@ update_enum_map(union trace_eval_map_item *ptr) return ptr; } -static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; @@ -4767,7 +4767,7 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; @@ -4775,12 +4775,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); return ptr; } -static void *enum_map_start(struct seq_file *m, loff_t *pos) +static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; @@ -4792,18 +4792,18 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) v++; while (v && l < *pos) { - v = enum_map_next(m, v, &l); + v = eval_map_next(m, v, &l); } return v; } -static void enum_map_stop(struct seq_file *m, void *v) +static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } -static int enum_map_show(struct seq_file *m, void *v) +static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; @@ -4814,23 +4814,23 @@ static int enum_map_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations tracing_enum_map_seq_ops = { - .start = enum_map_start, - .next = enum_map_next, - .stop = enum_map_stop, - .show = enum_map_show, +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, }; -static int tracing_enum_map_open(struct inode *inode, struct file *filp) +static int tracing_eval_map_open(struct inode *inode, struct file *filp) { if (tracing_disabled) return -ENODEV; - return seq_open(filp, &tracing_enum_map_seq_ops); + return seq_open(filp, &tracing_eval_map_seq_ops); } -static const struct file_operations tracing_enum_map_fops = { - .open = tracing_enum_map_open, +static const struct file_operations tracing_eval_map_fops = { + .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -4844,7 +4844,7 @@ trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; @@ -4861,7 +4861,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warn("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace eval mapping\n"); return; } @@ -4893,19 +4893,19 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, mutex_unlock(&trace_eval_mutex); } -static void trace_create_enum_file(struct dentry *d_tracer) +static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("enum_map", 0444, d_tracer, - NULL, &tracing_enum_map_fops); + NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_ENUM_MAP_FILE */ -static inline void trace_create_enum_file(struct dentry *d_tracer) { } -static inline void trace_insert_enum_map_file(struct module *mod, +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ -static void trace_insert_enum_map(struct module *mod, +static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; @@ -4915,9 +4915,9 @@ static void trace_insert_enum_map(struct module *mod, map = start; - trace_event_enum_update(map, len); + trace_event_eval_update(map, len); - trace_insert_enum_map_file(mod, start, len); + trace_insert_eval_map_file(mod, start, len); } static ssize_t @@ -7740,11 +7740,11 @@ static void __init trace_eval_init(void) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); + trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -static void trace_module_add_enums(struct module *mod) +static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) return; @@ -7756,11 +7756,11 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); + trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static void trace_module_remove_enums(struct module *mod) +static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; @@ -7788,7 +7788,7 @@ static void trace_module_remove_enums(struct module *mod) mutex_unlock(&trace_eval_mutex); } #else -static inline void trace_module_remove_enums(struct module *mod) { } +static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, @@ -7798,10 +7798,10 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - trace_module_add_enums(mod); + trace_module_add_evals(mod); break; case MODULE_STATE_GOING: - trace_module_remove_enums(mod); + trace_module_remove_evals(mod); break; } @@ -7841,7 +7841,7 @@ static __init int tracer_init_tracefs(void) trace_eval_init(); - trace_create_enum_file(d_tracer); + trace_create_eval_file(d_tracer); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a9667297ae49..69a3ab3ee4f5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_eval_map **map, int len); +void trace_event_eval_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cf5b9aa4d732..e6897b005947 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_eval_map **map, int len) +void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit v1.2.3 From 67ec0d85955630924b971e04c0954370a74b8706 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:49 -0500 Subject: tracing: Rename enum_replace to eval_replace The enum_replace stanza works as is for sizeof() calls as well as enums. Rename it as well. Link: http://lkml.kernel.org/r/20170531215653.3240-9-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e6897b005947..83dfd0dbbbfe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,12 +2067,12 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) +static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; - /* Find the length of the enum value as a string */ + /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) @@ -2127,14 +2127,14 @@ static void update_event_printk(struct trace_event_call *call, if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { - ptr = enum_replace(ptr, map, len); - /* Hmm, enum string smaller than value */ + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* - * No need to decrement here, as enum_replace() + * No need to decrement here, as eval_replace() * returns the pointer to the character passed - * the enum, and two enums can not be placed + * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ -- cgit v1.2.3 From 681bec0367c2606b6154060310a2ffa543175980 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:53 -0500 Subject: tracing: Rename update the enum_map file The enum_map file is used to display a list of symbol to name conversions. As its now used to resolve sizeof lets update the name and description. Link: http://lkml.kernel.org/r/20170531215653.3240-13-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 22 +++++++++++----------- kernel/trace/trace.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e06f04e98fe..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N -config TRACE_ENUM_MAP_FILE - bool "Show enum mappings for trace events" +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" depends on TRACING help - The "print fmt" of the trace events will show the enum names instead - of their values. This can cause problems for user space tools that - use this string to parse the raw data as user space does not know + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. This can cause problems for user space tools + that use this string to parse the raw data as user space does not know how to convert the string to its value. To fix this, there's a special macro in the kernel that can be used - to convert the enum into its value. If this macro is used, then the - print fmt strings will have the enums converted to their values. + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. If something does not get converted properly, this option can be - used to show what enums the kernel tried to convert. + used to show what enums/sizeof the kernel tried to convert. - This option is for debugging the enum conversions. A file is created - in the tracing directory called "enum_map" that will show the enum + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the names matched with their values and what trace event system they belong too. Normally, the mapping of the strings to values will be freed after boot up or module load. With this option, they will not be freed, as - they are needed for the "enum_map" file. Enabling this option will + they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. If unsure, say N diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d830be7f0ba6..19ac2088d10a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,8 +120,8 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -/* Map of enums to their values, for "enum_map" file */ +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_eval/enum_map items. + * pointer to the next array of saved eval_map items. */ union trace_eval_map_item { struct trace_eval_map map; @@ -154,7 +154,7 @@ union trace_eval_map_item { }; static union trace_eval_map_item *trace_eval_maps; -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4744,7 +4744,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { @@ -4895,15 +4895,15 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("enum_map", 0444, d_tracer, + trace_create_file("eval_map", 0444, d_tracer, NULL, &tracing_eval_map_fops); } -#else /* CONFIG_TRACE_ENUM_MAP_FILE */ +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } -#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) @@ -7759,7 +7759,7 @@ static void trace_module_add_evals(struct module *mod) trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; @@ -7789,7 +7789,7 @@ static void trace_module_remove_evals(struct module *mod) } #else static inline void trace_module_remove_evals(struct module *mod) { } -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) -- cgit v1.2.3 From 86a9c446c13ecd8793ea8599761322aed125d542 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:26 +0100 Subject: posix-cpu-timers: Move copyout of timespec into do_cpu_nanosleep() The posix-cpu-timer nanosleep() implementation can be simplified by moving the copy out of the remaining time to do_cpu_nanosleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated has to be stored in the restart block anyway. Instead of storing it only in the restart case, store it before calling do_cpu_nanosleep() and copy the remaining time in the signal exit path. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-1-viro@ZenIV.linux.org.uk --- kernel/time/posix-cpu-timers.c | 63 +++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb4a4eb44279..239fff980fd0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1226,9 +1226,10 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct itimerspec64 *it) + struct timespec64 *rqtp) { struct k_itimer timer; + struct itimerspec64 it; int error; /* @@ -1242,12 +1243,14 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec64 zero_it; + struct restart_block *restart = ¤t->restart_block; + struct timespec __user *rmtp; - memset(it, 0, sizeof *it); - it->it_value = *rqtp; + memset(&it, 0, sizeof it); + it.it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1277,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ *rqtp = ns_to_timespec64(timer.it.cpu.expires); - error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* * Timer is now unarmed, deletion can not fail. @@ -1297,7 +1300,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_unlock_irq(&timer.it_lock); } - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* * It actually did fire already. */ @@ -1305,6 +1308,18 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } error = -ERESTART_RESTARTBLOCK; + /* + * Report back to the user the time still remaining. + */ + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + struct timespec ts; + + ts = timespec64_to_timespec(it.it_value); + if (copy_to_user(rmtp, &ts, sizeof(*rmtp))) + return -EFAULT; + } + restart->nanosleep.expires = timespec64_to_ns(rqtp); } return error; @@ -1316,10 +1331,13 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec64 it; - struct timespec ts; int error; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + + restart_block->nanosleep.rmtp = rmtp; + /* * Diagnose required errors first. */ @@ -1328,23 +1346,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) return -EINVAL; - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + error = do_cpu_nanosleep(which_clock, flags, rqtp); if (error == -ERESTART_RESTARTBLOCK) { if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. - */ - ts = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) - return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; - restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1352,28 +1362,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct itimerspec64 it; struct timespec64 t; - struct timespec tmp; - int error; t = ns_to_timespec64(restart_block->nanosleep.expires); - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - struct timespec __user *rmtp = restart_block->nanosleep.rmtp; - /* - * Report back to the user the time still remaining. - */ - tmp = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) - return -EFAULT; - - restart_block->nanosleep.expires = timespec64_to_ns(&t); - } - return error; - + return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -- cgit v1.2.3 From 15f27ce24cb613e6e01ce27c4094c55e55dde5d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:27 +0100 Subject: alarmtimer: Move copyout and freeze handling into alarmtimer_do_nsleep() The alarmtimer nanosleep() implementation can be simplified by moving the copy out of the remaining time to alarmtimer_do_nsleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated has to be stored in the restart block anyway. Instead of storing it only in the restart case, store it before calling alarmtimer_do_nsleep() and copy the remaining time in the signal exit path. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-2-viro@ZenIV.linux.org.uk --- kernel/time/alarmtimer.c | 102 +++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index d8a7a7e214de..ac6e9bc6cc59 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -688,8 +688,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, * * Sets the alarm timer and sleeps until it is fired or interrupted. */ -static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, + enum alarmtimer_type type) { + struct timespec __user *rmtp; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); @@ -702,36 +704,26 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) __set_current_state(TASK_RUNNING); - return (alarm->data == NULL); -} - - -/** - * update_rmtp - Update remaining timespec value - * @exp: expiration time - * @type: timer type - * @rmtp: user pointer to remaining timepsec value - * - * Helper function that fills in rmtp value with time between - * now and the exp value - */ -static int update_rmtp(ktime_t exp, enum alarmtimer_type type, - struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = ktime_sub(exp, alarm_bases[type].gettime()); - - if (rem <= 0) + if (!alarm->data) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; + if (freezing(current)) + alarmtimer_freezerset(absexp, type); + rmtp = current->restart_block.nanosleep.rmtp; + if (rmtp) { + struct timespec rmt; + ktime_t rem; - return 1; + rem = ktime_sub(absexp, alarm_bases[type].gettime()); + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + } + return -ERESTART_RESTARTBLOCK; } /** @@ -743,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { enum alarmtimer_type type = restart->nanosleep.clockid; - ktime_t exp; - struct timespec __user *rmtp; + ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - int ret = 0; - exp = restart->nanosleep.expires; alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - return ret; + return alarmtimer_do_nsleep(&alarm, exp, type); } /** @@ -785,11 +757,16 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); - struct restart_block *restart; + struct restart_block *restart = ¤t->restart_block; struct alarm alarm; ktime_t exp; int ret = 0; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + + restart->nanosleep.rmtp = rmtp; + if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -808,32 +785,17 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = ktime_add(now, exp); } - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); + ret = alarmtimer_do_nsleep(&alarm, exp, type); + if (ret != -ERESTART_RESTARTBLOCK) + return ret; /* abs timers don't set remaining time or restart */ - if (flags == TIMER_ABSTIME) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } + if (flags == TIMER_ABSTIME) + return -ERESTARTNOHAND; - restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp; - restart->nanosleep.rmtp = rmtp; - ret = -ERESTART_RESTARTBLOCK; - -out: return ret; } -- cgit v1.2.3 From 192a82f9003fe8fabd6088aa646e829225a94c55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:28 +0100 Subject: hrtimer_nanosleep(): Pass rmtp in restart_block Store the pointer to the timespec which gets updated with the remaining time in the restart block and remove the function argument. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-3-viro@ZenIV.linux.org.uk --- kernel/compat.c | 6 +++--- kernel/time/hrtimer.c | 11 ++++++----- kernel/time/posix-stubs.c | 5 ++++- kernel/time/posix-timers.c | 5 ++++- 4 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..cc9ba9d29b47 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu64, - rmtp ? (struct timespec __user *)&rmt : NULL, - HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->restart_block.nanosleep.rmtp = + rmtp ? (struct timespec __user *)&rmt : NULL; + ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ac053bb5296e..4ae777f159de 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1503,10 +1503,11 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { - struct restart_block *restart; + struct restart_block *restart = ¤t->restart_block; + struct timespec __user *rmtp; struct hrtimer_sleeper t; int ret = 0; u64 slack; @@ -1526,16 +1527,15 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, goto out; } + rmtp = restart->nanosleep.rmtp; if (rmtp) { ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) goto out; } - restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); ret = -ERESTART_RESTARTBLOCK; @@ -1557,7 +1557,8 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index c0cd53eb018a..156a5e6f3bd2 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -115,7 +115,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, t64 = timespec_to_timespec64(t); if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 58c0f60b132f..1a9f59f8afc2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1043,7 +1043,10 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, struct timespec64 *tsave, struct timespec __user *rmtp) { - return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } -- cgit v1.2.3 From a7602681fc63f1a3ddd3da336296c9634c2ff974 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:29 +0100 Subject: hrtimer: Move copyout of remaining time to do_nanosleep() The hrtimer nanosleep() implementation can be simplified by moving the copy out of the remaining time to do_nanosleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated is already stored in the restart block at the call site, so the seperate handling of nanosleep and restart function can be avoided. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-4-viro@ZenIV.linux.org.uk --- kernel/time/hrtimer.c | 62 +++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ae777f159de..baa7b846b6e3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1441,6 +1441,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { + struct timespec __user *rmtp; hrtimer_init_sleeper(t, current); do { @@ -1457,48 +1458,33 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod __set_current_state(TASK_RUNNING); - return t->task == NULL; -} - -static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = hrtimer_expires_remaining(timer); - if (rem <= 0) + if (!t->task) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; + rmtp = current->restart_block.nanosleep.rmtp; + if (rmtp) { + struct timespec rmt; + ktime_t rem = hrtimer_expires_remaining(&t->timer); + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + } + return -ERESTART_RESTARTBLOCK; } long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; + int ret; hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: + ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } @@ -1506,8 +1492,7 @@ out: long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { - struct restart_block *restart = ¤t->restart_block; - struct timespec __user *rmtp; + struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; @@ -1518,7 +1503,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) + ret = do_nanosleep(&t, mode); + if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ @@ -1527,18 +1513,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, goto out; } - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - + restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); - - ret = -ERESTART_RESTARTBLOCK; out: destroy_hrtimer_on_stack(&t.timer); return ret; -- cgit v1.2.3 From 99e6c0e6ec349575886ca7daffc9cb7ec583176f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:30 +0100 Subject: posix-timers: Store rmtp into restart_block in sys_clock_nanosleep() ... instead of doing that in every ->nsleep() instance Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-5-viro@ZenIV.linux.org.uk --- kernel/time/alarmtimer.c | 8 +------- kernel/time/posix-cpu-timers.c | 12 +++--------- kernel/time/posix-timers.c | 10 +++++----- kernel/time/posix-timers.h | 2 +- 4 files changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ac6e9bc6cc59..d859a3601ddd 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -753,8 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsreq, - struct timespec __user *rmtp) + struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); struct restart_block *restart = ¤t->restart_block; @@ -762,11 +761,6 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, ktime_t exp; int ret = 0; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - - restart->nanosleep.rmtp = rmtp; - if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 239fff980fd0..ec6258c9cde5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1328,16 +1328,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp) { struct restart_block *restart_block = ¤t->restart_block; int error; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - - restart_block->nanosleep.rmtp = rmtp; - /* * Diagnose required errors first. */ @@ -1388,10 +1383,9 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, - struct timespec __user *rmtp) + struct timespec64 *rqtp) { - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } static long process_cpu_nsleep_restart(struct restart_block *restart_block) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1a9f59f8afc2..a3e5c01b430e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1041,11 +1041,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave) { - if (flags & TIMER_ABSTIME) - rmtp = NULL; - current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); @@ -1070,8 +1067,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, t64 = timespec_to_timespec64(t); if (!timespec64_valid(&t64)) return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64, rmtp); + return kc->nsleep(which_clock, flags, &t64); } /* diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index b086f5ba2f5b..bfd9e15c6ce0 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -10,7 +10,7 @@ struct k_clock { int (*clock_adj)(const clockid_t which_clock, struct timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); + struct timespec64 *); long (*nsleep_restart)(struct restart_block *restart_block); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, -- cgit v1.2.3 From edbeda46322fbcb15af2d2d0f2daffb0cd349a5a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:31 +0100 Subject: time/posix-timers: Move the compat copyouts to the nanosleep implementations Turn restart_block.nanosleep.{rmtp,compat_rmtp} into a tagged union (kind = 1 -> native, kind = 2 -> compat, kind = 0 -> nothing) and make the places doing actual copyout handle compat as well as native (that will become a helper in the next commit). Result: compat wrappers, messing with reassignments, etc. are gone. [ tglx: Folded in a variant of Peter Zijlstras enum patch ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-6-viro@ZenIV.linux.org.uk --- kernel/compat.c | 131 ----------------------------------------- kernel/time/alarmtimer.c | 16 +++-- kernel/time/hrtimer.c | 42 +++++++++++-- kernel/time/posix-cpu-timers.c | 20 +++++-- kernel/time/posix-stubs.c | 55 +++++++++++++---- kernel/time/posix-timers.c | 32 +++++++--- 6 files changed, 129 insertions(+), 167 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index cc9ba9d29b47..23afa26f574b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -213,82 +213,6 @@ int compat_convert_timespec(struct timespec __user **kts, return 0; } -static long compat_nanosleep_restart(struct restart_block *restart) -{ - struct compat_timespec __user *rmtp; - struct timespec rmt; - mm_segment_t oldfs; - long ret; - - restart->nanosleep.rmtp = (struct timespec __user *) &rmt; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep_restart(restart); - set_fs(oldfs); - - if (ret == -ERESTART_RESTARTBLOCK) { - rmtp = restart->nanosleep.compat_rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) -{ - struct timespec tu, rmt; - struct timespec64 tu64; - mm_segment_t oldfs; - long ret; - - if (compat_get_timespec(&tu, rqtp)) - return -EFAULT; - - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) - return -EINVAL; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - current->restart_block.nanosleep.rmtp = - rmtp ? (struct timespec __user *)&rmt : NULL; - ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); - set_fs(oldfs); - - /* - * hrtimer_nanosleep() can only return 0 or - * -ERESTART_RESTARTBLOCK here because: - * - * - we call it with HRTIMER_MODE_REL and therefor exclude the - * -ERESTARTNOHAND return path. - * - * - we supply the rmtp argument from the task stack (due to - * the necessary compat conversion. So the update cannot - * fail, which excludes the -EFAULT return path as well. If - * it fails nevertheless we have a bigger problem and wont - * reach this place anymore. - * - * - if the return value is 0, we do not have to update rmtp - * because there is no remaining time. - * - * We check for -ERESTART_RESTARTBLOCK nevertheless if the - * core implementation decides to return random nonsense. - */ - if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart = ¤t->restart_block; - - restart->fn = compat_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - return ret; -} - static inline long get_compat_itimerval(struct itimerval *o, struct compat_itimerval __user *i) { @@ -821,61 +745,6 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return err; } -static long compat_clock_nanosleep_restart(struct restart_block *restart) -{ - long err; - mm_segment_t oldfs; - struct timespec tu; - struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; - - restart->nanosleep.rmtp = (struct timespec __user *) &tu; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = clock_nanosleep_restart(restart); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&tu, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) -{ - long err; - mm_segment_t oldfs; - struct timespec in, out; - struct restart_block *restart; - - if (compat_get_timespec(&in, rqtp)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_nanosleep(which_clock, flags, - (struct timespec __user *) &in, - (struct timespec __user *) &out); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&out, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t->restart_block; - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index d859a3601ddd..57bcf94ee132 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -691,7 +692,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, enum alarmtimer_type type) { - struct timespec __user *rmtp; + struct restart_block *restart; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); @@ -709,8 +710,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, if (freezing(current)) alarmtimer_freezerset(absexp, type); - rmtp = current->restart_block.nanosleep.rmtp; - if (rmtp) { + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { struct timespec rmt; ktime_t rem; @@ -720,7 +721,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return 0; rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&rmt, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) return -EFAULT; } return -ERESTART_RESTARTBLOCK; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index baa7b846b6e3..5370da8fc0a4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -1441,7 +1442,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { - struct timespec __user *rmtp; + struct restart_block *restart; + hrtimer_init_sleeper(t, current); do { @@ -1461,15 +1463,23 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod if (!t->task) return 0; - rmtp = current->restart_block.nanosleep.rmtp; - if (rmtp) { - struct timespec rmt; + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); + struct timespec rmt; + if (rem <= 0) return 0; rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&rmt, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) return -EFAULT; } return -ERESTART_RESTARTBLOCK; @@ -1535,10 +1545,32 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, if (!timespec64_valid(&tu64)) return -EINVAL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 tu64; + struct timespec tu; + + if (compat_get_timespec(&tu, rqtp)) + return -EFAULT; + + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) + return -EINVAL; + + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} +#endif + /* * Functions related to boot-time initialization: */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ec6258c9cde5..1563ca22cf1f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -1243,10 +1244,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec64 zero_it; - struct restart_block *restart = ¤t->restart_block; - struct timespec __user *rmtp; + struct restart_block *restart; - memset(&it, 0, sizeof it); + memset(&it, 0, sizeof(it)); it.it_value = *rqtp; spin_lock_irq(&timer.it_lock); @@ -1311,12 +1311,20 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - rmtp = restart->nanosleep.rmtp; - if (rmtp) { + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { struct timespec ts; ts = timespec64_to_timespec(it.it_value); - if (copy_to_user(rmtp, &ts, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&ts, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &ts, + sizeof(ts))) return -EFAULT; } restart->nanosleep.expires = timespec64_to_ns(rqtp); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 156a5e6f3bd2..749b76f2d757 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,6 +17,7 @@ #include #include #include +#include asmlinkage long sys_ni_posix_timers(void) { @@ -110,25 +111,53 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) - return -EINVAL; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); + break; default: return -EINVAL; } + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #ifdef CONFIG_COMPAT -long clock_nanosleep_restart(struct restart_block *restart_block) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { - return hrtimer_nanosleep_restart(restart_block); + struct timespec64 t64; + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a3e5c01b430e..bec86b6b9814 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -1069,25 +1070,40 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return kc->nsleep(which_clock, flags, &t64); } -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. - */ -long clock_nanosleep_restart(struct restart_block *restart_block) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { - clockid_t which_clock = restart_block->nanosleep.clockid; const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; + struct timespec t; - if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; + + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; - return kc->nsleep_restart(restart_block); + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t64); } +#endif static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, -- cgit v1.2.3 From ce41aaf47af3d28c4c958e07675a3e0a51f09bd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:32 +0100 Subject: hrtimers/posix-timers: Merge nanosleep timespec copyout logics into a new helper Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-7-viro@ZenIV.linux.org.uk --- kernel/time/alarmtimer.c | 10 +--------- kernel/time/hrtimer.c | 29 ++++++++++++++++++++--------- kernel/time/posix-cpu-timers.c | 13 ++----------- 3 files changed, 23 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57bcf94ee132..7bed4e44f9bd 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -721,15 +721,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return 0; rmt = ktime_to_timespec(rem); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&rmt, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) - return -EFAULT; + return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5370da8fc0a4..db2f5f7b4ba5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1440,6 +1440,25 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +{ + switch(restart->nanosleep.type) { +#ifdef CONFIG_COMPAT + case TT_COMPAT: + if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + return -EFAULT; + break; +#endif + case TT_NATIVE: + if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + return -EFAULT; + break; + default: + BUG(); + } + return -ERESTART_RESTARTBLOCK; +} + static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; @@ -1472,15 +1491,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return 0; rmt = ktime_to_timespec(rem); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&rmt, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) - return -EFAULT; + return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1563ca22cf1f..993a924d1399 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1312,22 +1312,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; + restart->nanosleep.expires = timespec64_to_ns(rqtp); if (restart->nanosleep.type != TT_NONE) { struct timespec ts; ts = timespec64_to_timespec(it.it_value); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&ts, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &ts, - sizeof(ts))) - return -EFAULT; + error = nanosleep_copyout(restart, &ts); } - restart->nanosleep.expires = timespec64_to_ns(rqtp); } return error; -- cgit v1.2.3 From fb923c4a3c2ee735755d4a93522150fc35d0ecbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:33 +0100 Subject: posix-timers: Kill ->nsleep_restart() No more users. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-8-viro@ZenIV.linux.org.uk --- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 6 ------ kernel/time/posix-timers.c | 4 ---- kernel/time/posix-timers.h | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index db2f5f7b4ba5..45f83cc7c0c7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1496,7 +1496,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return -ERESTART_RESTARTBLOCK; } -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 993a924d1399..515148d4eeb1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1386,10 +1386,6 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } -static long process_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { @@ -1412,7 +1408,6 @@ const struct k_clock clock_posix_cpu = { .clock_get = posix_cpu_clock_get, .timer_create = posix_cpu_timer_create, .nsleep = posix_cpu_nsleep, - .nsleep_restart = posix_cpu_nsleep_restart, .timer_set = posix_cpu_timer_set, .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, @@ -1424,7 +1419,6 @@ const struct k_clock clock_process = { .clock_get = process_cpu_clock_get, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, }; const struct k_clock clock_thread = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bec86b6b9814..ea4a463436bf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1111,7 +1111,6 @@ static const struct k_clock clock_realtime = { .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1127,7 +1126,6 @@ static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1158,7 +1156,6 @@ static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1174,7 +1171,6 @@ static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index bfd9e15c6ce0..5e69bb85629f 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -11,7 +11,6 @@ struct k_clock { int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, struct timespec64 *); - long (*nsleep_restart)(struct restart_block *restart_block); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); -- cgit v1.2.3 From 3a4d44b6162555070194e486ff6b3799a8d323a2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:34 +0100 Subject: ntp: Move adjtimex related compat syscalls to native counterparts Get rid of set_fs() mess and sanitize compat_{get,put}_timex(), while we are at it. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-9-viro@ZenIV.linux.org.uk --- kernel/compat.c | 141 +++++++++++++++++---------------------------- kernel/time/posix-stubs.c | 2 + kernel/time/posix-timers.c | 27 +++++++++ kernel/time/time.c | 24 +++++++- 4 files changed, 104 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 23afa26f574b..97087b333543 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,60 +30,64 @@ #include -static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { - memset(txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc->modes, &utp->modes) || - __get_user(txc->offset, &utp->offset) || - __get_user(txc->freq, &utp->freq) || - __get_user(txc->maxerror, &utp->maxerror) || - __get_user(txc->esterror, &utp->esterror) || - __get_user(txc->status, &utp->status) || - __get_user(txc->constant, &utp->constant) || - __get_user(txc->precision, &utp->precision) || - __get_user(txc->tolerance, &utp->tolerance) || - __get_user(txc->time.tv_sec, &utp->time.tv_sec) || - __get_user(txc->time.tv_usec, &utp->time.tv_usec) || - __get_user(txc->tick, &utp->tick) || - __get_user(txc->ppsfreq, &utp->ppsfreq) || - __get_user(txc->jitter, &utp->jitter) || - __get_user(txc->shift, &utp->shift) || - __get_user(txc->stabil, &utp->stabil) || - __get_user(txc->jitcnt, &utp->jitcnt) || - __get_user(txc->calcnt, &utp->calcnt) || - __get_user(txc->errcnt, &utp->errcnt) || - __get_user(txc->stbcnt, &utp->stbcnt)) + struct compat_timex tx32; + + if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + return 0; } -static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) -{ - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc->modes, &utp->modes) || - __put_user(txc->offset, &utp->offset) || - __put_user(txc->freq, &utp->freq) || - __put_user(txc->maxerror, &utp->maxerror) || - __put_user(txc->esterror, &utp->esterror) || - __put_user(txc->status, &utp->status) || - __put_user(txc->constant, &utp->constant) || - __put_user(txc->precision, &utp->precision) || - __put_user(txc->tolerance, &utp->tolerance) || - __put_user(txc->time.tv_sec, &utp->time.tv_sec) || - __put_user(txc->time.tv_usec, &utp->time.tv_usec) || - __put_user(txc->tick, &utp->tick) || - __put_user(txc->ppsfreq, &utp->ppsfreq) || - __put_user(txc->jitter, &utp->jitter) || - __put_user(txc->shift, &utp->shift) || - __put_user(txc->stabil, &utp->stabil) || - __put_user(txc->jitcnt, &utp->jitcnt) || - __put_user(txc->calcnt, &utp->calcnt) || - __put_user(txc->errcnt, &utp->errcnt) || - __put_user(txc->stbcnt, &utp->stbcnt) || - __put_user(txc->tai, &utp->tai)) +int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) +{ + struct compat_timex tx32; + + memset(&tx32, 0, sizeof(struct compat_timex)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) return -EFAULT; return 0; } @@ -705,29 +709,6 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) -{ - struct timex txc; - mm_segment_t oldfs; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); - set_fs(oldfs); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { @@ -944,24 +925,6 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) -{ - struct timex txc; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - ret = do_adjtimex(&txc); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 749b76f2d757..954d1d8ff9e6 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -28,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void) } #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) SYS_NI(timer_create); SYS_NI(timer_gettime); @@ -40,6 +41,7 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif +COMPAT_SYS_NI(clock_adjtime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ea4a463436bf..b1b6d52d6425 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1018,6 +1018,33 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + err = compat_get_timex(&ktx, utp); + if (err) + return err; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0) + err = compat_put_timex(utp, &ktx); + + return err; +} +#endif + SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 49c73c6ed648..400662f16c5a 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -224,12 +225,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) * structure. But bear in mind that the structures * may change */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +{ + struct timex txc; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + ret = do_adjtimex(&txc); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} +#endif + /* * Convert jiffies to milliseconds and back. * -- cgit v1.2.3 From 1acbe7708b0313b33287bb4ffcbf26462ea3c588 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:35 +0100 Subject: posix-timers: Take compat timer_settime(2) to native one ... and get rid of set_fs() in there Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-10-viro@ZenIV.linux.org.uk --- kernel/compat.c | 23 ------------ kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 87 +++++++++++++++++++++++++++++++++------------- 3 files changed, 64 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 97087b333543..df39e2e00c47 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -635,29 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) -{ - long err; - mm_segment_t oldfs; - struct itimerspec newts, oldts; - - if (!new) - return -EINVAL; - if (get_compat_itimerspec(&newts, new)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_settime(timer_id, flags, - (struct itimerspec __user *) &newts, - (struct itimerspec __user *) &oldts); - set_fs(oldfs); - if (!err && old && put_compat_itimerspec(old, &oldts)) - return -EFAULT; - return err; -} - COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 954d1d8ff9e6..ad263df132d6 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -42,6 +42,7 @@ SYS_NI(setitimer); SYS_NI(alarm); #endif COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b1b6d52d6425..a73feac191f9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -819,31 +819,21 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) +static int do_timer_settime(timer_t timer_id, int flags, + struct itimerspec64 *new_spec64, + struct itimerspec64 *old_spec64) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec, old_spec; + const struct k_clock *kc; struct k_itimer *timr; unsigned long flag; - const struct k_clock *kc; int error = 0; - if (!new_setting) + if (!timespec64_valid(&new_spec64->it_interval) || + !timespec64_valid(&new_spec64->it_value)) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - - if (!timespec64_valid(&new_spec64.it_interval) || - !timespec64_valid(&new_spec64.it_value)) - return -EINVAL; - if (rtn) - memset(rtn, 0, sizeof(*rtn)); + if (old_spec64) + memset(old_spec64, 0, sizeof(*old_spec64)); retry: timr = lock_timer(timer_id, &flag); if (!timr) @@ -853,22 +843,71 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec64, rtn); + error = kc->timer_set(timr, flags, new_spec64, old_spec64); unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + old_spec64 = NULL; // We already got the old time... goto retry; } - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; + return error; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new_setting) + return -EINVAL; + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old_setting) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + } return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old ? &old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new) + return -EINVAL; + if (get_compat_itimerspec(&new_spec, new)) + return -EFAULT; + + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (put_compat_itimerspec(old, &old_spec)) + error = -EFAULT; + } + return error; +} +#endif + int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; -- cgit v1.2.3 From b0dc12426ec404de99d7e75a12a22d9201d90914 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:36 +0100 Subject: posix-timers: Take compat timer_gettime(2) to native one ... and get rid of set_fs() in there Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-11-viro@ZenIV.linux.org.uk --- kernel/compat.c | 17 ----------------- kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 43 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index df39e2e00c47..1fb8cf7e691e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -635,23 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) -{ - long err; - mm_segment_t oldfs; - struct itimerspec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_gettime(timer_id, - (struct itimerspec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_itimerspec(setting, &ts)) - return -EFAULT; - return err; -} - COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index ad263df132d6..f4a1962d1729 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -43,6 +43,7 @@ SYS_NI(alarm); #endif COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a73feac191f9..e82bb1fd614e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -690,11 +690,8 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } /* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) +static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct itimerspec64 cur_setting64; - struct itimerspec cur_setting; struct k_itimer *timr; const struct k_clock *kc; unsigned long flags; @@ -704,21 +701,49 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; - memset(&cur_setting64, 0, sizeof(cur_setting64)); + memset(setting, 0, sizeof(*setting)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting64); + kc->timer_get(timr, setting); unlock_timer(timr, flags); + return ret; +} - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (put_compat_itimerspec(setting, &cur_setting)) + ret = -EFAULT; + } return ret; } +#endif /* * Get the number of overruns of a POSIX.1b interval timer. This is to -- cgit v1.2.3 From 54ad9c46c262ce4a603dc7887e37956896a0211d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:37 +0100 Subject: itimers: Move compat itimer syscalls to native ones get rid of set_fs(), sanitize compat copyin/copyout. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-12-viro@ZenIV.linux.org.uk --- kernel/compat.c | 69 +++++++++++------------------------------------ kernel/time/itimer.c | 38 ++++++++++++++++++++++++++ kernel/time/posix-stubs.c | 2 ++ 3 files changed, 56 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 1fb8cf7e691e..c349417d2c40 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -217,65 +217,28 @@ int compat_convert_timespec(struct timespec __user **kts, return 0; } -static inline long get_compat_itimerval(struct itimerval *o, - struct compat_itimerval __user *i) +int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | - __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | - __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | - __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); -} - -static inline long put_compat_itimerval(struct compat_itimerval __user *o, - struct itimerval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | - __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | - __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | - __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); -} - -asmlinkage long sys_ni_posix_timers(void); - -COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) -{ - struct itimerval kit; - int error; - - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); + struct compat_itimerval v32; - error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) - error = -EFAULT; - return error; + if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; } -COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) +int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) { - struct itimerval kin, kout; - int error; + struct compat_itimerval v32; - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); - - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - if (put_compat_itimerval(out, &kout)) - return -EFAULT; - return 0; + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; } static compat_clock_t clock_t_to_compat_clock_t(clock_t x) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 087d6a1279b8..9dd7ff5e445a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) +{ + struct itimerval kit; + int error = do_getitimer(which, &kit); + + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} +#endif + /* * The timer is automagically restarted, when interval != 0 @@ -294,3 +308,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return -EFAULT; return 0; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else { + memset(&kin, 0, sizeof(kin)); + } + + error = do_setitimer(which, &kin, out ? &kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} +#endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index f4a1962d1729..7f88517461e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -44,6 +44,8 @@ SYS_NI(alarm); COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC -- cgit v1.2.3 From d822cdcce43f9d4dcddbf9c68f9537d542ccc3c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:38 +0100 Subject: posix-timers: Move compat versions of clock_gettime/settime/getres Move them to the native implementations and get rid of the set_fs() hackery. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-13-viro@ZenIV.linux.org.uk --- kernel/compat.c | 51 ----------------------------------- kernel/time/posix-stubs.c | 53 +++++++++++++++++++++++++++++++++++++ kernel/time/posix-timers.c | 66 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 115 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c349417d2c40..582c38bfdf60 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -598,57 +598,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - if (compat_get_timespec(&ts, tp)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_settime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_gettime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_getres(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && tp && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 7f88517461e8..a375c31cb352 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -137,6 +137,59 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, } #ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec64 new_tp64; + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct compat_timespec __user *,tp) +{ + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; + default: return -EINVAL; + } + + kernel_tp = timespec64_to_timespec(kernel_tp64); + if (compat_put_timespec(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (compat_put_timespec(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e82bb1fd614e..61a5fb91a3c7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1082,8 +1082,66 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + error = -EFAULT; + + return error; +} + #ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; + struct timespec new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + + return kc->clock_set(which_clock, &new_tp64); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); + + if (!error && compat_put_timespec(&kernel_tp, tp)) + error = -EFAULT; + + return error; +} + COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, struct compat_timex __user *, utp) { @@ -1107,10 +1165,9 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, return err; } -#endif -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; @@ -1123,11 +1180,12 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, error = kc->clock_getres(which_clock, &rtn_tp64); rtn_tp = timespec64_to_timespec(rtn_tp64); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && compat_put_timespec(&rtn_tp, tp)) error = -EFAULT; return error; } +#endif /* * nanosleep for monotonic and realtime clocks -- cgit v1.2.3 From 2482097c6c0f01ad74c9b2cff120a519ac59846e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:39 +0100 Subject: posix-timers: Move compat_timer_create() to native, get rid of set_fs() Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-14-viro@ZenIV.linux.org.uk --- kernel/compat.c | 18 -------------- kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 59 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 582c38bfdf60..4544eb63edfa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -580,24 +580,6 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } -COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, - struct compat_sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - struct sigevent __user *event = NULL; - - if (timer_event_spec) { - struct sigevent kevent; - - event = compat_alloc_user_space(sizeof(*event)); - if (get_compat_sigevent(&kevent, timer_event_spec) || - copy_to_user(event, &kevent, sizeof(*event))) - return -EFAULT; - } - - return sys_timer_create(which_clock, event, created_timer_id); -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a375c31cb352..38f3b20efa29 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -41,6 +41,7 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif +COMPAT_SYS_NI(timer_create); COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 61a5fb91a3c7..c9f45a84fb8b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -490,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer) } /* Create a POSIX.1b interval timer. */ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) +static int do_timer_create(clockid_t which_clock, struct sigevent *event, + timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - sigevent_t event; int it_id_set = IT_ID_NOT_SET; if (!kc) @@ -523,29 +520,25 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->kclock = kc; new_timer->it_overrun = -1; - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } + if (event) { rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(&event)); + new_timer->it_pid = get_pid(good_sigevent(event)); rcu_read_unlock(); if (!new_timer->it_pid) { error = -EINVAL; goto out; } + new_timer->it_sigev_notify = event->sigev_notify; + new_timer->sigq->info.si_signo = event->sigev_signo; + new_timer->sigq->info.si_value = event->sigev_value; } else { - memset(&event.sigev_value, 0, sizeof(event.sigev_value)); - event.sigev_notify = SIGEV_SIGNAL; - event.sigev_signo = SIGALRM; - event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->sigq->info.si_signo = SIGALRM; + memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq->info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->sigq->info.si_signo = event.sigev_signo; - new_timer->sigq->info.si_value = event.sigev_value; new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; @@ -576,6 +569,36 @@ out: return error; } +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (copy_from_user(&event, timer_event_spec, sizeof (event))) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (get_compat_sigevent(&event, timer_event_spec)) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} +#endif + /* * Locking issues: We need to protect the result of the id look up until * we get the timer locked down so it is not deleted under us. The -- cgit v1.2.3 From b180db2c8ca6692a10b79631cadc18d03303d494 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:40 +0100 Subject: time: Move compat_time()/stime() to native Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-15-viro@ZenIV.linux.org.uk --- kernel/compat.c | 40 ---------------------------------------- kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 4544eb63edfa..aa7b9a27f9e7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -739,46 +739,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_TIME - -/* compat_time_t is a 32 bit "long" and needs to get converted. */ - -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) -{ - compat_time_t i; - struct timeval tv; - - do_gettimeofday(&tv); - i = tv.tv_sec; - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/time/time.c b/kernel/time/time.c index 400662f16c5a..e5d44999ff78 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -100,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ +#ifdef CONFIG_COMPAT +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +{ + struct timeval tv; + compat_time_t i; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif + SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { -- cgit v1.2.3 From 2b2d02856b3176701c91d723356f766d6ee27853 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:41 +0100 Subject: time: Move compat_gettimeofday()/settimeofday() to native Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-16-viro@ZenIV.linux.org.uk --- kernel/compat.c | 38 -------------------------------------- kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index aa7b9a27f9e7..ebd8bdc3fd68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -92,44 +92,6 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - - return 0; -} - -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timespec64 new_ts; - struct timeval user_tv; - struct timezone new_tz; - - if (tv) { - if (compat_get_timeval(&user_tv, tv)) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); -} - static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || diff --git a/kernel/time/time.c b/kernel/time/time.c index e5d44999ff78..7c89e437c4d7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -257,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + if (tv) { + struct timeval ktv; + + do_gettimeofday(&ktv); + if (compat_put_timeval(&ktv, tv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timeval user_tv; + struct timezone new_tz; + + if (tv) { + if (compat_get_timeval(&user_tv, tv)) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} +#endif + SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ -- cgit v1.2.3 From 343d8fc208e43e50525ae6e0fc4845b9966b7318 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jun 2017 23:29:14 +0200 Subject: posix-cpu-timers: Avoid timespec conversion in do_cpu_nanosleep() No point in converting the expiry time back and forth. No point either to update the value in the caller supplied variable. mark the rqtp argument const. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: John Stultz Cc: Peter Zijlstra --- kernel/time/posix-cpu-timers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 515148d4eeb1..3adfa42ca24c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1227,10 +1227,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { - struct k_itimer timer; struct itimerspec64 it; + struct k_itimer timer; + u64 expires; int error; /* @@ -1279,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - *rqtp = ns_to_timespec64(timer.it.cpu.expires); + expires = timer.it.cpu.expires; error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* @@ -1312,7 +1313,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = timespec64_to_ns(rqtp); + restart->nanosleep.expires = expires; if (restart->nanosleep.type != TT_NONE) { struct timespec ts; -- cgit v1.2.3 From 938e7cf2d569833a5acf689a8926faf507826253 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jun 2017 23:34:33 +0200 Subject: posix-timers: Make nanosleep timespec argument const No nanosleep implementation modifies the rqtp argument. Mark is const. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: John Stultz Cc: Peter Zijlstra --- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 4 ++-- kernel/time/posix-timers.c | 4 ++-- kernel/time/posix-timers.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7bed4e44f9bd..c991cf212c6d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -753,7 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsreq) + const struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); struct restart_block *restart = ¤t->restart_block; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 45f83cc7c0c7..81da124f1115 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1510,7 +1510,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(struct timespec64 *rqtp, +long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3adfa42ca24c..9df618ee64cf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1328,7 +1328,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { struct restart_block *restart_block = ¤t->restart_block; int error; @@ -1383,7 +1383,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c9f45a84fb8b..82d67be7d9d1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1214,9 +1214,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsave) + const struct timespec64 *rqtp) { - return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 5e69bb85629f..fb303c3be4d3 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -10,7 +10,7 @@ struct k_clock { int (*clock_adj)(const clockid_t which_clock, struct timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, - struct timespec64 *); + const struct timespec64 *); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); -- cgit v1.2.3 From 31fd85816dbe3a714bcc3f67c17c3dd87011f79e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Jun 2017 15:52:13 -0700 Subject: bpf: permits narrower load from bpf program context fields Currently, verifier will reject a program if it contains an narrower load from the bpf context structure. For example, __u8 h = __sk_buff->hash, or __u16 p = __sk_buff->protocol __u32 sample_period = bpf_perf_event_data->sample_period which are narrower loads of 4-byte or 8-byte field. This patch solves the issue by: . Introduce a new parameter ctx_field_size to carry the field size of narrower load from prog type specific *__is_valid_access validator back to verifier. . The non-zero ctx_field_size for a memory access indicates (1). underlying prog type specific convert_ctx_accesses supporting non-whole-field access (2). the current insn is a narrower or whole field access. . In verifier, for such loads where load memory size is less than ctx_field_size, verifier transforms it to a full field load followed by proper masking. . Currently, __sk_buff and bpf_perf_event_data->sample_period are supporting narrowing loads. . Narrower stores are still not allowed as typical ctx stores are just normal stores. Because of this change, some tests in verifier will fail and these tests are removed. As a bonus, rename some out of bound __sk_buff->cb access to proper field name and remove two redundant "skb cb oob" tests. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++------------- kernel/trace/bpf_trace.c | 21 ++++++++++---- 2 files changed, 67 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 519a6144d3d3..44b97d958fb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -758,15 +758,26 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + int ctx_field_size = 0; + /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { + /* a non zero ctx_field_size indicates: + * . For this field, the prog type specific ctx conversion algorithm + * only supports whole field access. + * . This ctx access is a candiate for later verifier transformation + * to load the whole field and then apply a mask to get correct result. + */ + if (ctx_field_size) + env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -868,7 +879,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { @@ -911,7 +922,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value_and_range(state->regs, value_regno); @@ -972,7 +983,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; int err; @@ -994,13 +1005,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; /* check whether atomic_add can read the memory */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); if (err) return err; /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn->dst_reg, insn->off, + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); } @@ -1416,7 +1427,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); if (err) return err; } @@ -2993,18 +3004,12 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn->src_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W && - BPF_SIZE(insn->code) != BPF_DW) { - insn_idx++; - continue; - } - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { @@ -3032,7 +3037,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn); + err = check_xadd(env, insn_idx, insn); if (err) return err; insn_idx++; @@ -3051,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) @@ -3080,7 +3085,7 @@ static int do_check(struct bpf_verifier_env *env) return err; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) @@ -3383,7 +3388,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, delta = 0; + int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3423,11 +3428,39 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; + off = insn->off; + size = bpf_size_to_bytes(BPF_SIZE(insn->code)); + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; + is_narrower_load = (type == BPF_READ && size < ctx_field_size); + + /* If the read access is a narrower load of the field, + * convert to a 4/8-byte load, to minimum program type specific + * convert_ctx_access changes. If conversion is successful, + * we will apply proper mask to the result. + */ + if (is_narrower_load) { + int size_code = BPF_H; + + if (ctx_field_size == 4) + size_code = BPF_W; + else if (ctx_field_size == 8) + size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); + insn->code = BPF_LDX | BPF_MEM | size_code; + } cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } + if (is_narrower_load) { + if (ctx_field_size <= 4) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + else + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 051d7fca0c09..9d3ec8253131 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,17 +581,26 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { + int sample_period_off; + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (off == offsetof(struct bpf_perf_event_data, sample_period)) { - if (size != sizeof(u64)) - return false; + + /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ + sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); + if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { + *ctx_field_size = 8; +#ifdef __LITTLE_ENDIAN + return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; +#else + return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; +#endif } else { if (size != sizeof(long)) return false; -- cgit v1.2.3 From 73a7242a06ff995d771fbe243e72b516feaa6e3d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:01 -0400 Subject: cgroup: Keep accurate count of tasks in each css_set The reference count in the css_set data structure was used as a proxy of the number of tasks attached to that css_set. However, that count is actually not an accurate measure especially with thread mode support. So a new variable nr_tasks is added to the css_set to keep track of the actual task count. This new variable is protected by the css_set_lock. Functions that require the actual task count are updated to use the new variable. tj: s/task_count/nr_tasks/ for consistency with cgroup_root->nr_cgrps. Refreshed on top of cgroup/for-v4.13 which dropped on css_set_populated() -> nr_tasks conversion. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 6 +----- kernel/cgroup/cgroup.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..e9ea5f201fac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -334,10 +334,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { @@ -346,7 +342,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += refcount_read(&link->cset->refcount); + count += link->cset->nr_tasks; spin_unlock_irq(&css_set_lock); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8d4e85eae42c..dbfd7028b1c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set + * + * css_set_populated() should be the same as !!cset->nr_tasks at steady + * state. However, css_set_populated() can be called while a task is being + * added to or removed from the linked list before the nr_tasks is + * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { @@ -1598,6 +1603,7 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); + cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -2064,8 +2070,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); + to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); put_css_set_locked(from_cset); + from_cset->nr_tasks--; } } spin_unlock_irq(&css_set_lock); @@ -4789,6 +4797,7 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); + cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } spin_unlock_irq(&css_set_lock); @@ -4838,6 +4847,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); -- cgit v1.2.3 From a28f8f5e995fe5964ae304444913536058f26e37 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:02 -0400 Subject: cgroup: Move debug cgroup to its own file The debug cgroup currently resides within cgroup-v1.c and is enabled only for v1 cgroup. To enable the debug cgroup also for v2, it makes sense to put the code into its own file as it will no longer be v1 specific. There is no change to the debug cgroup specific code. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/Makefile | 1 + kernel/cgroup/cgroup-internal.h | 2 + kernel/cgroup/cgroup-v1.c | 149 +------------------------------------- kernel/cgroup/debug.c | 153 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 148 deletions(-) create mode 100644 kernel/cgroup/debug.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int cgroup_task_count(const struct cgroup *cgrp); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e9ea5f201fac..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -335,7 +335,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ -static int cgroup_task_count(const struct cgroup *cgrp) +int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; @@ -1259,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); - - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %pK\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..1c209aa43733 --- /dev/null +++ b/kernel/cgroup/debug.c @@ -0,0 +1,153 @@ +#include +#include +#include + +#include "cgroup-internal.h" + +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +/* + * debug_taskcount_read - return the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = refcount_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; -- cgit v1.2.3 From 23b0be480f341db26ce0dee7d3f6e67f8e0e166f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:03 -0400 Subject: cgroup: Make Kconfig prompt of debug cgroup more accurate The Kconfig prompt and description of the debug cgroup controller more accurate by saying that it is for debug purpose only and its interfaces are unstable. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 1c209aa43733..cbe77a2087c5 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,9 @@ +/* + * Debug controller + * + * WARNING: This controller is for cgroup core debugging only. + * Its interfaces are unstable and subject to changes at any time. + */ #include #include #include -- cgit v1.2.3 From 575313f40ff33d0c2aff2701dfb2ccfcd6211d55 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:04 -0400 Subject: cgroup: Make debug cgroup support v2 and thread mode Besides supporting cgroup v2 and thread mode, the following changes are also made: 1) current_* cgroup files now resides only at the root as we don't need duplicated files of the same function all over the cgroup hierarchy. 2) The cgroup_css_links_read() function is modified to report the number of tasks that are skipped because of overflow. 3) The number of extra unaccounted references are displayed. 4) The current_css_set_read() function now prints out the addresses of the css'es associated with the current css_set. 5) A new cgroup_subsys_states file is added to display the css objects associated with a cgroup. 6) A new cgroup_masks file is added to display the various controller bit masks in the cgroup. tj: Dropped thread mode related information for now so that debug controller changes aren't blocked on the thread mode. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 170 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 153 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index cbe77a2087c5..057d9b07f461 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -36,10 +36,37 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int current_css_set_read(struct seq_file *seq, void *v) { - return (u64)(unsigned long)current->cgroups; + struct css_set *cset; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + int i, refcnt; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + refcnt = refcount_read(&cset->refcount); + seq_printf(seq, "css_set %pK %d", cset, refcnt); + if (refcnt > cset->nr_tasks) + seq_printf(seq, " +%d", refcnt - cset->nr_tasks); + seq_puts(seq, "\n"); + + /* + * Print the css'es stored in the current css_set. + */ + for_each_subsys(ss, i) { + css = cset->subsys[ss->id]; + if (!css) + continue; + seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, + (unsigned long)css, css->id); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + return 0; } static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, @@ -84,31 +111,126 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; + int dead_cnt = 0, extra_refs = 0; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, "css_set %pK\n", cset); + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } + seq_puts(seq, "\n"); list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } - continue; - overflow: - seq_puts(seq, " ...\n"); + /* show # of overflowed tasks */ + if (count > MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " ... (%d)\n", + count - MAX_TASKS_SHOWN_PER_CSS); + + if (cset->dead) { + seq_puts(seq, " [dead]\n"); + dead_cnt++; + } + + WARN_ON(count != cset->nr_tasks); } spin_unlock_irq(&css_set_lock); + + if (!dead_cnt && !extra_refs) + return 0; + + seq_puts(seq, "\n"); + if (extra_refs) + seq_printf(seq, "extra references = %d\n", extra_refs); + if (dead_cnt) + seq_printf(seq, "dead css_sets = %d\n", dead_cnt); + + return 0; +} + +static int cgroup_subsys_states_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + char pbuf[16]; + int i; + + mutex_lock(&cgroup_mutex); + for_each_subsys(ss, i) { + css = rcu_dereference_check(cgrp->subsys[ss->id], true); + if (!css) + continue; + + pbuf[0] = '\0'; + + /* Show the parent CSS if applicable*/ + if (css->parent) + snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", + css->parent->id); + seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, + (unsigned long)css, css->id, + atomic_read(&css->online_cnt), pbuf); + } + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + int i, j; + struct { + u16 *mask; + char *name; + } mask_list[] = { + { &cgrp->subtree_control, "subtree_control" }, + { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, + }; + + mutex_lock(&cgroup_mutex); + for (i = 0; i < ARRAY_SIZE(mask_list); i++) { + u16 mask = *mask_list[i].mask; + bool first = true; + + seq_printf(seq, "%-17s: ", mask_list[i].name); + for_each_subsys(ss, j) { + if (!(mask & (1 << ss->id))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; + } + seq_putc(seq, '\n'); + } + + mutex_unlock(&cgroup_mutex); return 0; } @@ -126,17 +248,20 @@ static struct cftype debug_files[] = { { .name = "current_css_set", - .read_u64 = current_css_set_read, + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_refcount", .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_cg_links", .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { @@ -144,6 +269,16 @@ static struct cftype debug_files[] = { .seq_show = cgroup_css_links_read, }, + { + .name = "cgroup_subsys_states", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "cgroup_masks", + .seq_show = cgroup_masks_read, + }, + { .name = "releasable", .read_u64 = releasable_read, @@ -153,7 +288,8 @@ static struct cftype debug_files[] = { }; struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, + .dfl_cftypes = debug_files, }; -- cgit v1.2.3 From 8cc38fa7fa317d44710f24475576b1f9ee205da9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:32 -0400 Subject: cgroup: make debug an implicit controller on cgroup2 Make debug an implicit controller on cgroup2 which is enabled by "cgroup_debug" boot param. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 057d9b07f461..d61e692a5338 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -240,7 +240,7 @@ static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) !css_has_online_children(&css->cgroup->self)); } -static struct cftype debug_files[] = { +static struct cftype debug_legacy_files[] = { { .name = "taskcount", .read_u64 = debug_taskcount_read, @@ -287,9 +287,62 @@ static struct cftype debug_files[] = { { } /* terminate */ }; +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "csses", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "masks", + .seq_show = cgroup_masks_read, + }, + + { } /* terminate */ +}; + struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .legacy_cftypes = debug_files, - .dfl_cftypes = debug_files, + .legacy_cftypes = debug_legacy_files, }; + +/* + * On v2, debug is an implicit controller enabled by "cgroup_debug" boot + * parameter. + */ +static int __init enable_cgroup_debug(char *str) +{ + debug_cgrp_subsys.dfl_cftypes = debug_files; + debug_cgrp_subsys.implicit_on_dfl = true; + return 1; +} +__setup("cgroup_debug", enable_cgroup_debug); -- cgit v1.2.3 From 2866c0b4cf25274040f0b4cc045ad1f44b885dd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:36 -0400 Subject: cgroup: refactor cgroup_masks_read() in the debug controller Factor out cgroup_masks_read_one() out of cgroup_masks_read() for simplicity. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index d61e692a5338..163fdbd7adf6 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -200,36 +200,32 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) return 0; } -static int cgroup_masks_read(struct seq_file *seq, void *v) +static void cgroup_masks_read_one(struct seq_file *seq, const char *name, + u16 mask) { - struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss; - int i, j; - struct { - u16 *mask; - char *name; - } mask_list[] = { - { &cgrp->subtree_control, "subtree_control" }, - { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, - }; + int ssid; + bool first = true; - mutex_lock(&cgroup_mutex); - for (i = 0; i < ARRAY_SIZE(mask_list); i++) { - u16 mask = *mask_list[i].mask; - bool first = true; - - seq_printf(seq, "%-17s: ", mask_list[i].name); - for_each_subsys(ss, j) { - if (!(mask & (1 << ss->id))) - continue; - if (!first) - seq_puts(seq, ", "); - seq_puts(seq, ss->name); - first = false; - } - seq_putc(seq, '\n'); + seq_printf(seq, "%-17s: ", name); + for_each_subsys(ss, ssid) { + if (!(mask & (1 << ssid))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; } + seq_putc(seq, '\n'); +} +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + mutex_lock(&cgroup_mutex); + cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); + cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From b6053d40e3387c916d56d319ba6bfdb2c42a67c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:41 -0400 Subject: cgroup: fix lockdep warning in debug controller The debug controller grabs cgroup_mutex from interface file show functions which can deadlock and triggers lockdep warnings. Fix it by using cgroup_kn_lock_live()/cgroup_kn_unlock() instead. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 163fdbd7adf6..dac46af22782 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -38,12 +38,15 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, static int current_css_set_read(struct seq_file *seq, void *v) { + struct kernfs_open_file *of = seq->private; struct css_set *cset; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; int i, refcnt; - mutex_lock(&cgroup_mutex); + if (!cgroup_kn_lock_live(of->kn, false)) + return -ENODEV; + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); @@ -65,7 +68,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return 0; } @@ -174,13 +177,17 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static int cgroup_subsys_states_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; char pbuf[16]; int i; - mutex_lock(&cgroup_mutex); + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + for_each_subsys(ss, i) { css = rcu_dereference_check(cgrp->subsys[ss->id], true); if (!css) @@ -196,7 +203,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) (unsigned long)css, css->id, atomic_read(&css->online_cnt), pbuf); } - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } @@ -221,12 +229,17 @@ static void cgroup_masks_read_one(struct seq_file *seq, const char *name, static int cgroup_masks_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; - mutex_lock(&cgroup_mutex); cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } -- cgit v1.2.3 From 33e4f80ee69b5168badf37edbfed796eb48434b9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Jun 2017 22:56:34 +0200 Subject: ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For this reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops. In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled. However, to preserve the existing behavior with respect to suspend-to-RAM, this only is done in the suspend-to-idle case and only if an SCI has occurred while suspended. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- kernel/power/suspend.c | 35 +++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..3ecf275d7e44 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -84,11 +86,9 @@ static void freeze_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); - pr_debug("PM: suspend-to-idle\n"); /* Make the current CPU wait so it can enter the idle loop too. */ wait_event(suspend_freeze_wait_head, suspend_freeze_state == FREEZE_STATE_WAKE); - pr_debug("PM: resume from suspend-to-idle\n"); cpuidle_pause(); put_online_cpus(); @@ -98,6 +98,31 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + pr_debug("PM: suspend-to-idle\n"); + + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); + + pr_debug("PM: resume from suspend-to-idle\n"); } void freeze_wake(void) @@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); -- cgit v1.2.3 From cd33f5f2cbfaadc21270f3ddac7c3c33e0a1a28c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 11:53:09 -0400 Subject: audit: make sure we never skip the multicast broadcast When the auditd connection is reset, either intentionally or due to a failure, any records that were in the main backlog queue would not be sent in a multicast broadcast. This patch fixes this problem by not flushing the main backlog queue on a connection reset, the main kauditd_thread() will take care of that normally. Resolves: https://github.com/linux-audit/audit-kernel/issues/41 Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e1e2b3abfb93..7cad70214b81 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -605,11 +605,10 @@ static void auditd_reset(const struct auditd_connection *ac) if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); - /* flush all of the main and retry queues to the hold queue */ + /* flush the retry queue to the hold queue, but don't touch the main + * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb); - while ((skb = skb_dequeue(&audit_queue))) - kauditd_hold_skb(skb); } /** -- cgit v1.2.3 From 57db7e4a2d92c2d3dfbca4ef8057849b2682436b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 13 Jun 2017 04:31:16 -0500 Subject: signal: Only reschedule timers on signals timers have sent Thomas Gleixner wrote: > The CRIU support added a 'feature' which allows a user space task to send > arbitrary (kernel) signals to itself. The changelog says: > > The kernel prevents sending of siginfo with positive si_code, because > these codes are reserved for kernel. I think we can allow a task to > send such a siginfo to itself. This operation should not be dangerous. > > Quite contrary to that claim, it turns out that it is outright dangerous > for signals with info->si_code == SI_TIMER. The following code sequence in > a user space task allows to crash the kernel: > > id = timer_create(CLOCK_XXX, ..... signo = SIGX); > timer_set(id, ....); > info->si_signo = SIGX; > info->si_code = SI_TIMER: > info->_sifields._timer._tid = id; > info->_sifields._timer._sys_private = 2; > rt_[tg]sigqueueinfo(..., SIGX, info); > sigemptyset(&sigset); > sigaddset(&sigset, SIGX); > rt_sigtimedwait(sigset, info); > > For timers based on CLOCK_PROCESS_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID this > results in a kernel crash because sigwait() dequeues the signal and the > dequeue code observes: > > info->si_code == SI_TIMER && info->_sifields._timer._sys_private != 0 > > which triggers the following callchain: > > do_schedule_next_timer() -> posix_cpu_timer_schedule() -> arm_timer() > > arm_timer() executes a list_add() on the timer, which is already armed via > the timer_set() syscall. That's a double list add which corrupts the posix > cpu timer list. As a consequence the kernel crashes on the next operation > touching the posix cpu timer list. > > Posix clocks which are internally implemented based on hrtimers are not > affected by this because hrtimer_start() can handle already armed timers > nicely, but it's a reliable way to trigger the WARN_ON() in > hrtimer_forward(), which complains about calling that function on an > already armed timer. This problem has existed since the posix timer code was merged into 2.5.63. A few releases earlier in 2.5.60 ptrace gained the ability to inject not just a signal (which linux has supported since 1.0) but the full siginfo of a signal. The core problem is that the code will reschedule in response to signals getting dequeued not just for signals the timers sent but for other signals that happen to a si_code of SI_TIMER. Avoid this confusion by testing to see if the queued signal was preallocated as all timer signals are preallocated, and so far only the timer code preallocates signals. Move the check for if a timer needs to be rescheduled up into collect_signal where the preallocation check must be performed, and pass the result back to dequeue_signal where the code reschedules timers. This makes it clear why the code cares about preallocated timers. Cc: stable@vger.kernel.org Reported-by: Thomas Gleixner History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reference: 66dd34ad31e5 ("signal: allow to send any siginfo to itself") Reference: 1669ce53e2ff ("Add PTRACE_GETSIGINFO and PTRACE_SETSIGINFO") Fixes: db8b50ba75f2 ("[PATCH] POSIX clocks & timers") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..45b4c1ffe14e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, + bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); + + *resched_timer = + (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER) && + (info->si_sys_private); + __sigqueue_free(first); } else { /* @@ -548,12 +555,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) + siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info); + collect_signal(sig, pending, info, resched_timer); return sig; } @@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { + bool resched_timer = false; int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); + mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? @@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) current->jobctl |= JOBCTL_STOP_DEQUEUED; } #ifdef CONFIG_POSIX_TIMERS - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + if (resched_timer) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave -- cgit v1.2.3 From 204a2be30a7a8a8d12642f23f3fbdc8b9923b500 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 7 Jun 2017 00:11:44 +0200 Subject: m68k: Remove ptrace_signal_deliver This fixes debugger syscall restart interactions. A debugger that modifies the tracee's program counter is expected to set the orig_d0 pseudo register to -1, to disable a possible syscall restart. This removes the last user of the ptrace_signal_deliver hook in the ptrace signal handling, so remove that as well. Signed-off-by: Andreas Schwab Signed-off-by: Geert Uytterhoeven --- kernel/signal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..f0a48e5b1f0b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2092,7 +2092,6 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info) { - ptrace_signal_deliver(); /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will -- cgit v1.2.3 From ceea5e3771ed2378668455fa21861bead7504df5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Jun 2017 16:44:20 -0700 Subject: time: Fix clock->read(clock) race around clocksource changes In tests, which excercise switching of clocksources, a NULL pointer dereference can be observed on AMR64 platforms in the clocksource read() function: u64 clocksource_mmio_readl_down(struct clocksource *c) { return ~(u64)readl_relaxed(to_mmio_clksrc(c)->reg) & c->mask; } This is called from the core timekeeping code via: cycle_now = tkr->read(tkr->clock); tkr->read is the cached tkr->clock->read() function pointer. When the clocksource is changed then tkr->clock and tkr->read are updated sequentially. The code above results in a sequential load operation of tkr->read and tkr->clock as well. If the store to tkr->clock hits between the loads of tkr->read and tkr->clock, then the old read() function is called with the new clock pointer. As a consequence the read() function dereferences a different data structure and the resulting 'reg' pointer can point anywhere including NULL. This problem was introduced when the timekeeping code was switched over to use struct tk_read_base. Before that, it was theoretically possible as well when the compiler decided to reload clock in the code sequence: now = tk->clock->read(tk->clock); Add a helper function which avoids the issue by reading tk_read_base->clock once into a local variable clk and then issue the read function via clk->read(clk). This guarantees that the read() function always gets the proper clocksource pointer handed in. Since there is now no use for the tkr.read pointer, this patch also removes it, and to address stopping the fast timekeeper during suspend/resume, it introduces a dummy clocksource to use rather then just a dummy read function. Signed-off-by: John Stultz Acked-by: Ingo Molnar Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: stable Cc: Miroslav Lichvar Cc: Daniel Mentz Link: http://lkml.kernel.org/r/1496965462-20003-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 52 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9652bc57fd09..eff94cb8e89e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). + */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) */ do { seq = read_seqcount_begin(&tk_core.seq); - now = tkr->read(tkr->clock); + now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; @@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) u64 cycle_now, delta; /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; - tk->tkr_mono.read = clock->read; tk->tkr_mono.mask = clock->mask; - tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; - tk->tkr_raw.read = clock->read; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) now += timekeeping_delta_to_ns(tkr, clocksource_delta( - tkr->read(tkr->clock), + tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs) return cycles_at_suspend; } +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. @@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk) struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - cycles_at_suspend = tkr->read(tkr->clock); - tkr_dummy.read = dummy_clock_read; + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - tkr_dummy.read = dummy_clock_read; + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } @@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr_mono.clock; u64 cycle_now, delta; u64 nsec; - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); - - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, @@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. */ - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!cycle_between(interval_start, cycles, now)) { clock_was_set_seq = tk->clock_was_set_seq; @@ -1629,7 +1649,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { u64 nsec, cyc_delta; @@ -2030,7 +2050,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif -- cgit v1.2.3 From 3d88d56c5873f6eebe23e05c3da701960146b801 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Jun 2017 16:44:21 -0700 Subject: time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting Due to how the MONOTONIC_RAW accumulation logic was handled, there is the potential for a 1ns discontinuity when we do accumulations. This small discontinuity has for the most part gone un-noticed, but since ARM64 enabled CLOCK_MONOTONIC_RAW in their vDSO clock_gettime implementation, we've seen failures with the inconsistency-check test in kselftest. This patch addresses the issue by using the same sub-ns accumulation handling that CLOCK_MONOTONIC uses, which avoids the issue for in-kernel users. Since the ARM64 vDSO implementation has its own clock_gettime calculation logic, this patch reduces the frequency of errors, but failures are still seen. The ARM64 vDSO will need to be updated to include the sub-nanosecond xtime_nsec values in its calculation for this issue to be completely fixed. Signed-off-by: John Stultz Tested-by: Daniel Mentz Cc: Prarit Bhargava Cc: Kevin Brodsky Cc: Richard Cochran Cc: Stephen Boyd Cc: Will Deacon Cc: "stable #4 . 8+" Cc: Miroslav Lichvar Link: http://lkml.kernel.org/r/1496965462-20003-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index eff94cb8e89e..b602c48cb841 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -280,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = (interval * clock->mult) >> clock->shift; + tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -1996,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; - u64 raw_nsecs; + u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) @@ -2011,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = (u64)tk->raw_interval << shift; - raw_nsecs += tk->raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - tk->raw_time.tv_sec += raw_secs; + tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_time.tv_sec++; } - tk->raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; -- cgit v1.2.3 From 842c08846420baa619fe3cb8c9af538efdb89428 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Jun 2017 10:54:52 +0200 Subject: livepatch: Fix stacking of patches with respect to RCU rcu_read_(un)lock(), list_*_rcu(), and synchronize_rcu() are used for a secure access and manipulation of the list of patches that modify the same function. In particular, it is the variable func_stack that is accessible from the ftrace handler via struct ftrace_ops and klp_ops. Of course, it synchronizes also some states of the patch on the top of the stack, e.g. func->transition in klp_ftrace_handler. At the same time, this mechanism guards also the manipulation of task->patch_state. It is modified according to the state of the transition and the state of the process. Now, all this works well as long as RCU works well. Sadly livepatching might get into some corner cases when this is not true. For example, RCU is not watching when rcu_read_lock() is taken in idle threads. It is because they might sleep and prevent reaching the grace period for too long. There are ways how to make RCU watching even in idle threads, see rcu_irq_enter(). But there is a small location inside RCU infrastructure when even this does not work. This small problematic location can be detected either before calling rcu_irq_enter() by rcu_irq_enter_disabled() or later by rcu_is_watching(). Sadly, there is no safe way how to handle it. Once we detect that RCU was not watching, we might see inconsistent state of the function stack and the related variables in klp_ftrace_handler(). Then we could do a wrong decision, use an incompatible implementation of the function and break the consistency of the system. We could warn but we could not avoid the damage. Fortunately, ftrace has similar problems and they seem to be solved well there. It uses a heavy weight implementation of some RCU operations. In particular, it replaces: + rcu_read_lock() with preempt_disable_notrace() + rcu_read_unlock() with preempt_enable_notrace() + synchronize_rcu() with schedule_on_each_cpu(sync_work) My understanding is that this is RCU implementation from a stone age. It meets the core RCU requirements but it is rather ineffective. Especially, it does not allow to batch or speed up the synchronize calls. On the other hand, it is very trivial. It allows to safely trace and/or livepatch even the RCU core infrastructure. And the effectiveness is a not a big issue because using ftrace or livepatches on productive systems is a rare operation. The safety is much more important than a negligible extra load. Note that the alternative implementation follows the RCU principles. Therefore, we could and actually must use list_*_rcu() variants when manipulating the func_stack. These functions allow to access the pointers in the right order and with the right barriers. But they do not use any other information that would be set only by rcu_read_lock(). Also note that there are actually two problems solved in ftrace: First, it cares about the consistency of RCU read sections. It is being solved the way as described and used in this patch. Second, ftrace needs to make sure that nobody is inside the dynamic trampoline when it is being freed. For this, it also calls synchronize_rcu_tasks() in preemptive kernel in ftrace_shutdown(). Livepatch has similar problem but it is solved by ftrace for free. klp_ftrace_handler() is a good guy and never sleeps. In addition, it is registered with FTRACE_OPS_FL_DYNAMIC. It causes that unregister_ftrace_function() calls: * schedule_on_each_cpu(ftrace_sync) - always * synchronize_rcu_tasks() - in preemptive kernel The effect is that nobody is neither inside the dynamic trampoline nor inside the ftrace handler after unregister_ftrace_function() returns. [jkosina@suse.cz: reformat changelog, fix comment] Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/patch.c | 8 ++++++-- kernel/livepatch/transition.c | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f8269036bf0b..52c4e907c14b 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - rcu_read_lock(); + /* + * A variant of synchronize_sched() is used to allow patching functions + * where RCU is not watching, see klp_synchronize_transition(). + */ + preempt_disable_notrace(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); @@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(regs, (unsigned long)func->new_func); unlock: - rcu_read_unlock(); + preempt_enable_notrace(); } /* diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index adc0cc64aa4b..b004a1fb6032 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -48,6 +48,28 @@ static void klp_transition_work_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); +/* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + */ +static void klp_sync(struct work_struct *work) +{ +} + +/* + * We allow to patch also functions where RCU is not watching, + * e.g. before user_exit(). We can not rely on the RCU infrastructure + * to do the synchronization. Instead hard force the sched synchronization. + * + * This approach allows to use RCU functions for manipulating func_stack + * safely. + */ +static void klp_synchronize_transition(void) +{ + schedule_on_each_cpu(klp_sync); +} + /* * The transition to the target patch state is complete. Clean up the data * structures. @@ -73,7 +95,7 @@ static void klp_complete_transition(void) * func->transition gets cleared, the handler may choose a * removed function. */ - synchronize_rcu(); + klp_synchronize_transition(); } if (klp_transition_patch->immediate) @@ -92,7 +114,7 @@ static void klp_complete_transition(void) /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) - synchronize_rcu(); + klp_synchronize_transition(); read_lock(&tasklist_lock); for_each_process_thread(g, task) { @@ -136,7 +158,11 @@ void klp_cancel_transition(void) */ void klp_update_patch_state(struct task_struct *task) { - rcu_read_lock(); + /* + * A variant of synchronize_sched() is used to allow patching functions + * where RCU is not watching, see klp_synchronize_transition(). + */ + preempt_disable_notrace(); /* * This test_and_clear_tsk_thread_flag() call also serves as a read @@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task) if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) task->patch_state = READ_ONCE(klp_target_state); - rcu_read_unlock(); + preempt_enable_notrace(); } /* @@ -539,7 +565,7 @@ void klp_reverse_transition(void) clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); /* Let any remaining calls to klp_update_patch_state() complete */ - synchronize_rcu(); + klp_synchronize_transition(); klp_start_transition(); } -- cgit v1.2.3 From cde50a67397c0da7d11795d4b4418384022ab8e6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Sun, 18 Jun 2017 14:06:01 +0000 Subject: locking/rtmutex: Don't initialize lockdep when not required pi_mutex isn't supposed to be tracked by lockdep, but just passing NULLs for name and key will cause lockdep to spew a warning and die, which is not what we want it to do. Skip lockdep initialization if the caller passed NULLs for name and key, suggesting such initialization isn't desired. Signed-off-by: Sasha Levin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5694788ad8d ("rt_mutex: Add lockdep annotations") Link: http://lkml.kernel.org/r/20170618140548.4763-1-alexander.levin@verizon.com Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 43123533e9b1..78069895032a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1661,7 +1661,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name, key); + if (name && key) + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); -- cgit v1.2.3 From ac6424b981bce1c4bc55675c6ce11bfe1bbfa64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:06:13 +0200 Subject: sched/wait: Rename wait_queue_t => wait_queue_entry_t Rename: wait_queue_t => wait_queue_entry_t 'wait_queue_t' was always a slight misnomer: its name implies that it's a "queue", but in reality it's a queue *entry*. The 'real' queue is the wait queue head, which had to carry the name. Start sorting this out by renaming it to 'wait_queue_entry_t'. This also allows the real structure name 'struct __wait_queue' to lose its double underscore and become 'struct wait_queue_entry', which is the more canonical nomenclature for such data types. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 ++-- kernel/futex.c | 2 +- kernel/sched/completion.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/wait.c | 42 +++++++++++++++++++++--------------------- kernel/workqueue.c | 4 ++-- 6 files changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..7d694437ab44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1004,7 +1004,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; + wait_queue_entry_t child_wait; int notask_error; }; @@ -1541,7 +1541,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..d6cf71d08f21 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -225,7 +225,7 @@ struct futex_pi_state { * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, if (!x->done) { DECLARE_WAITQUEUE(wait, current); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 326d4f88e2b1..5b36644536ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3687,7 +3687,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { return try_to_wake_up(curr->private, mode, wake_flags); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..301ea02dede0 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -21,7 +21,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -32,18 +32,18 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -66,7 +66,7 @@ EXPORT_SYMBOL(remove_wait_queue); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { - wait_queue_t *curr, *next; + wait_queue_entry_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; @@ -184,20 +184,20 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_t *wait, int flags) +void init_wait_entry(wait_queue_entry_t *wait, int flags) { wait->flags = flags; wait->private = current; @@ -206,7 +206,7 @@ void init_wait_entry(wait_queue_t *wait, int flags) } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; long ret = 0; @@ -230,7 +230,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) } else { if (list_empty(&wait->task_list)) { if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); else __add_wait_queue(q, wait); } @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); * condition in the caller before they add the wait * entry to the wake queue. */ -int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) } EXPORT_SYMBOL(do_wait_intr); -int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -290,7 +290,7 @@ EXPORT_SYMBOL(do_wait_intr_irq); * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -316,7 +316,7 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -351,7 +351,7 @@ static inline bool is_kthread_should_stop(void) * remove_wait_queue(&wq, &wait); * */ -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -375,7 +375,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -391,7 +391,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) } EXPORT_SYMBOL(woken_wake_function); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit @@ -534,7 +534,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) return bit_waitqueue(p, 0); } -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct *work; }; -static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); -- cgit v1.2.3 From 50816c48997af857d4bab3dca1aba90339705e96 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 10:33:16 +0100 Subject: sched/wait: Standardize internal naming of wait-queue entries So the various wait-queue entry variables in include/linux/wait.h and kernel/sched/wait.c are named in a colorfully inconsistent way: wait_queue_entry_t *wait wait_queue_entry_t *__wait (even in plain C code!) wait_queue_entry_t *q (!) wait_queue_entry_t *new (making anyone who knows C++ cringe) wait_queue_entry_t *old I think part of the reason for the inconsistency is the constant apparent confusion about what a wait queue 'head' versus 'entry' is. ( Some of the documentation talks about a 'wait descriptor', which is the wait-queue entry itself - further adding to the confusion. ) The most common name is 'wait', but that in itself is somewhat ambiguous as well, as it does not really make it clear whether it's a wait-queue entry or head. To improve all this name the wait-queue entry structure parameters and variables consistently and push through this naming into all the wait.h and wait.c code: struct wait_queue_entry *wq_entry The 'wq_' prefix makes it easy to grep for, and we also use the opportunity to move away from the typedef to a plain 'struct' naming: in the kernel we typically reserve typedefs for cases where a C structure is really small and somewhat opaque - such as pte_t. wait-queue entries are neither small nor opaque, so use the more standard 'struct xxx_entry' list management code nomenclature instead. ( We don't touch external users, and we preserve the typedef as well for actual wait-queue users, to reduce unnecessary churn. ) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 98 ++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 301ea02dede0..c37b3140763e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -21,34 +21,34 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) +void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wait); + __add_wait_queue_entry_tail(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) +void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); + __remove_wait_queue(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -170,43 +170,43 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); + if (list_empty(&wq_entry->task_list)) + __add_wait_queue(q, wq_entry); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_entry_tail(q, wait); + if (list_empty(&wq_entry->task_list)) + __add_wait_queue_entry_tail(q, wq_entry); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_entry_t *wait, int flags) +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { - wait->flags = flags; - wait->private = current; - wait->func = autoremove_wake_function; - INIT_LIST_HEAD(&wait->task_list); + wq_entry->flags = flags; + wq_entry->private = current; + wq_entry->func = autoremove_wake_function; + INIT_LIST_HEAD(&wq_entry->task_list); } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; @@ -225,14 +225,14 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int s * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); ret = -ERESTARTSYS; } else { - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_entry_tail(q, wait); + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(q, wq_entry); else - __add_wait_queue(q, wait); + __add_wait_queue(q, wq_entry); } set_current_state(state); } @@ -284,13 +284,13 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on - * @wait: wait descriptor + * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) +void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) * have _one_ other CPU that looks at or modifies * the list). */ - if (!list_empty_careful(&wait->task_list)) { + if (!list_empty_careful(&wq_entry->task_list)) { spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); spin_unlock_irqrestore(&q->lock, flags); } } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - int ret = default_wake_function(wait, mode, sync, key); + int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); return ret; } EXPORT_SYMBOL(autoremove_wake_function); @@ -341,17 +341,17 @@ static inline bool is_kthread_should_stop(void) * * p->state = mode; condition = true; * smp_mb(); // A smp_wmb(); // C - * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; * schedule() try_to_wake_up(); * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; * smp_mb() // B smp_wmb(); // C - * wait->flags |= WQ_FLAG_WOKEN; + * wq_entry->flags |= WQ_FLAG_WOKEN; * } * remove_wait_queue(&wq, &wait); * */ -long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -359,7 +359,7 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must * also observe all state before the wakeup. */ - if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); @@ -369,13 +369,13 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -385,24 +385,24 @@ int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void * and is paired with smp_store_mb() in wait_woken(). */ smp_wmb(); /* C */ - wait->flags |= WQ_FLAG_WOKEN; + wq_entry->flags |= WQ_FLAG_WOKEN; - return default_wake_function(wait, mode, sync, key); + return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); -int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wait); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || test_bit(key->bit_nr, key->flags)) return 0; else - return autoremove_wake_function(wait, mode, sync, key); + return autoremove_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(wake_bit_function); @@ -534,19 +534,19 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) return bit_waitqueue(p, 0); } -static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync, +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wait); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || atomic_read(val) != 0) return 0; - return autoremove_wake_function(wait, mode, sync, key); + return autoremove_wake_function(wq_entry, mode, sync, key); } /* -- cgit v1.2.3 From 9d9d676f595b5081326be7a17dc681fcb38fb3b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:10:18 +0100 Subject: sched/wait: Standardize internal naming of wait-queue heads The wait-queue head parameters and variables are named in a couple of ways, we have the following variants currently: wait_queue_head_t *q wait_queue_head_t *wq wait_queue_head_t *head In particular the 'wq' naming is ambiguous in the sense whether it's a wait-queue head or entry name - as entries were often named 'wait'. ( Not to mention the confusion of any readers coming over from workqueue-land. ) Standardize all this around a single, unambiguous parameter and variable name: struct wait_queue_head *wq_head which is easy to grep for and also rhymes nicely with the wait-queue entry naming: struct wait_queue_entry *wq_entry Also rename: struct __wait_queue_head => struct wait_queue_head ... and use this struct type to migrate from typedefs usage to 'struct' usage, which is more in line with existing kernel practices. Don't touch any external users and preserve the main wait_queue_head_t typedef. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 154 ++++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c37b3140763e..203aeea96f16 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -12,44 +12,44 @@ #include #include -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); + spin_lock_init(&wq_head->lock); + lockdep_set_class_and_name(&wq_head->lock, key, name); + INIT_LIST_HEAD(&wq_head->task_list); } EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __remove_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, /** * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function @@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets @@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!q)) + if (unlikely(!wq_head)) return; if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) { - __wake_up_sync_key(q, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -170,30 +170,30 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (list_empty(&wq_entry->task_list)) - __add_wait_queue(q, wq_entry); + __add_wait_queue(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (list_empty(&wq_entry->task_list)) - __add_wait_queue_entry_tail(q, wq_entry); + __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); @@ -206,12 +206,12 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* * Exclusive waiter must not fail if it was selected by wakeup, @@ -219,7 +219,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent * * The caller will recheck the condition and return success if * we were already woken up, we can not miss the event because - * wakeup locks/unlocks the same q->lock. + * wakeup locks/unlocks the same wq_head->lock. * * But we need to ensure that set-condition + wakeup after that * can't see us, it should wake up another exclusive waiter if @@ -230,13 +230,13 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent } else { if (list_empty(&wq_entry->task_list)) { if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_entry_tail(q, wq_entry); + __add_wait_queue_entry_tail(wq_head, wq_entry); else - __add_wait_queue(q, wq_entry); + __add_wait_queue(wq_head, wq_entry); } set_current_state(state); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); return ret; } @@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on + * @wq_head: waitqueue waited on * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -309,9 +309,9 @@ void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) * the list). */ if (!list_empty_careful(&wq_entry->task_list)) { - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); list_del_init(&wq_entry->task_list); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } } EXPORT_SYMBOL(finish_wait); @@ -334,7 +334,7 @@ static inline bool is_kthread_should_stop(void) /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * - * add_wait_queue(&wq, &wait); + * add_wait_queue(&wq_head, &wait); * for (;;) { * if (condition) * break; @@ -348,7 +348,7 @@ static inline bool is_kthread_should_stop(void) * smp_mb() // B smp_wmb(); // C * wq_entry->flags |= WQ_FLAG_WOKEN; * } - * remove_wait_queue(&wq, &wait); + * remove_wait_queue(&wq_head, &wait); * */ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) @@ -412,17 +412,17 @@ EXPORT_SYMBOL(wake_bit_function); * permitted return codes. Nonzero return codes halt waiting and return. */ int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { int ret = 0; do { - prepare_to_wait(wq, &q->wait, mode); + prepare_to_wait(wq_head, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -430,10 +430,10 @@ EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned mode) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); - return __wait_on_bit(wq, &wait, action, mode); + return __wait_on_bit(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit); @@ -441,36 +441,36 @@ int __sched out_of_line_wait_on_bit_timeout( void *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq, &wait, action, mode); + return __wait_on_bit(wq_head, &wait, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq, &q->wait, mode); + prepare_to_wait_exclusive(wq_head, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) { ret = action(&q->key, mode); /* * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wq->lock, + * finish_wait() does not necessarily takes wwq_head->lock, * but test_and_set_bit() implies mb() which pairs with * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); } if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { if (!ret) - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return 0; } else if (ret) { return ret; @@ -482,18 +482,18 @@ EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, wait_bit_action_f *action, unsigned mode) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); - return __wait_on_bit_lock(wq, &wait, action, mode); + return __wait_on_bit_lock(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); } EXPORT_SYMBOL(__wake_up_bit); @@ -555,20 +555,20 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo * return codes halt waiting and return. */ static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int (*action)(atomic_t *), unsigned mode) { atomic_t *val; int ret = 0; do { - prepare_to_wait(wq, &q->wait, mode); + prepare_to_wait(wq_head, &q->wait, mode); val = q->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return ret; } @@ -586,10 +586,10 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), unsigned mode) { - wait_queue_head_t *wq = atomic_t_waitqueue(p); + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wait, p); - return __wait_on_atomic_t(wq, &wait, action, mode); + return __wait_on_atomic_t(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); -- cgit v1.2.3 From 2141713616c652aeabf2dd5c1e89bc601c4fed6a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:25:39 +0100 Subject: sched/wait: Standardize 'struct wait_bit_queue' wait-queue entry field name Rename 'struct wait_bit_queue::wait' to ::wq_entry, to more clearly name it as a wait-queue entry. Propagate it to a couple of usage sites where the wait-bit-queue internals are exposed. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 203aeea96f16..f1ba0625b8be 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -395,7 +395,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wq_entry); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || @@ -418,11 +418,11 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int ret = 0; do { - prepare_to_wait(wq_head, &q->wait, mode); + prepare_to_wait(wq_head, &q->wq_entry, mode); if (test_bit(q->key.bit_nr, q->key.flags)) ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -431,9 +431,9 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - return __wait_on_bit(wq_head, &wait, action, mode); + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit); @@ -442,10 +442,10 @@ int __sched out_of_line_wait_on_bit_timeout( unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq_head, &wait, action, mode); + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); @@ -456,7 +456,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq_head, &q->wait, mode); + prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode); if (test_bit(q->key.bit_nr, q->key.flags)) { ret = action(&q->key, mode); /* @@ -466,11 +466,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); } if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { if (!ret) - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return 0; } else if (ret) { return ret; @@ -483,9 +483,9 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - return __wait_on_bit_lock(wq_head, &wait, action, mode); + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); @@ -538,8 +538,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wait); + struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || @@ -562,24 +561,24 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q int ret = 0; do { - prepare_to_wait(wq_head, &q->wait, mode); + prepare_to_wait(wq_head, &q->wq_entry, mode); val = q->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return ret; } #define DEFINE_WAIT_ATOMIC_T(name, p) \ struct wait_bit_queue name = { \ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ + .wq_entry = { \ .private = current, \ .func = wake_atomic_t_function, \ .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ }, \ } @@ -587,9 +586,9 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), unsigned mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); - return __wait_on_atomic_t(wq_head, &wait, action, mode); + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); -- cgit v1.2.3 From 76c85ddc4695bb7b8209bfeff11f5156088f9197 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:35:27 +0100 Subject: sched/wait: Standardize wait_bit_queue naming So wait-bit-queue head variables are often named: struct wait_bit_queue *q ... which is a bit ambiguous and super confusing, because they clearly suggest wait-queue head semantics and behavior (they rhyme with the old wait_queue_t *q naming), while they are extended wait-queue _entries_, not heads! They are misnomers in two ways: - the 'wait_bit_queue' leaves open the question of whether it's an entry or a head - the 'q' parameter and local variable naming falsely implies that it's a 'queue' - while it's an entry. This resulted in sometimes confusing cases such as: finish_wait(wq, &q->wait); where the 'q' is not a wait-queue head, but a wait-bit-queue entry. So improve this all by standardizing wait-bit-queue nomenclature similar to wait-queue head naming: struct wait_bit_queue => struct wait_bit_queue_entry q => wbq_entry Which makes it all a much clearer: struct wait_bit_queue_entry *wbq_entry ... and turns the former confusing piece of code into: finish_wait(wq_head, &wbq_entry->wq_entry; which IMHO makes it apparently clear what we are doing, without having to analyze the context of the code: we are adding a wait-queue entry to a regular wait-queue head, which entry is embedded in a wait-bit-queue entry. I'm not a big fan of acronyms, but repeating wait_bit_queue_entry in field and local variable names is too long, so Hopefully it's clear enough that 'wq_' prefixes stand for wait-queues, while 'wbq_' prefixes stand for wait-bit-queues. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f1ba0625b8be..95e6d3820cba 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -394,8 +394,7 @@ EXPORT_SYMBOL(woken_wake_function); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wq_entry); + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || @@ -412,17 +411,17 @@ EXPORT_SYMBOL(wake_bit_function); * permitted return codes. Nonzero return codes halt waiting and return. */ int __sched -__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned mode) { int ret = 0; do { - prepare_to_wait(wq_head, &q->wq_entry, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key, mode); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq_head, &q->wq_entry); + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -450,15 +449,15 @@ int __sched out_of_line_wait_on_bit_timeout( EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); int __sched -__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned mode) { int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - ret = action(&q->key, mode); + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); /* * See the comment in prepare_to_wait_event(). * finish_wait() does not necessarily takes wwq_head->lock, @@ -466,11 +465,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); } - if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { if (!ret) - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); return 0; } else if (ret) { return ret; @@ -538,7 +537,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry); + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || @@ -554,25 +553,25 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo * return codes halt waiting and return. */ static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, int (*action)(atomic_t *), unsigned mode) { atomic_t *val; int ret = 0; do { - prepare_to_wait(wq_head, &q->wq_entry, mode); - val = q->key.flags; + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); return ret; } #define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ + struct wait_bit_queue_entry name = { \ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ .wq_entry = { \ .private = current, \ -- cgit v1.2.3 From 5dd43ce2f69d42a71dcacdb13d17d8c0ac1fe8f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:19:09 +0200 Subject: sched/wait: Split out the wait_bit*() APIs from into The wait_bit*() types and APIs are mixed into wait.h, but they are a pretty orthogonal extension of wait-queues. Furthermore, only about 50 kernel files use these APIs, while over 1000 use the regular wait-queue functionality. So clean up the main wait.h by moving the wait-bit functionality out of it, into a separate .h and .c file: include/linux/wait_bit.h for types and APIs kernel/sched/wait_bit.c for the implementation Update all header dependencies. This reduces the size of wait.h rather significantly, by about 30%. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 2 +- kernel/sched/wait.c | 257 ---------------------------------------------- kernel/sched/wait_bit.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+), 258 deletions(-) create mode 100644 kernel/sched/wait_bit.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..16277e2ed8ee 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,7 +17,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o +obj-y += wait.o wait_bit.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 95e6d3820cba..6bcd7c3c4501 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -390,260 +390,3 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sy return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); - -int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wq_entry, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) - ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - return __wait_on_bit(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, - unsigned mode, unsigned long timeout) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - wq_entry.key.timeout = jiffies + timeout; - return __wait_on_bit(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); - -int __sched -__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - for (;;) { - prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); - if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { - ret = action(&wbq_entry->key, mode); - /* - * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wwq_head->lock, - * but test_and_set_bit() implies mb() which pairs with - * smp_mb__after_atomic() before wake_up_page(). - */ - if (ret) - finish_wait(wq_head, &wbq_entry->wq_entry); - } - if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { - if (!ret) - finish_wait(wq_head, &wbq_entry->wq_entry); - return 0; - } else if (ret) { - return ret; - } - } -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq_head)) - __wake_up(wq_head, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wq_entry, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - val = wbq_entry->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wq_entry = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wq_entry, p); - - return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); - -__sched int bit_wait(struct wait_bit_key *word, int mode) -{ - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait); - -__sched int bit_wait_io(struct wait_bit_key *word, int mode) -{ - io_schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait_io); - -__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_timeout); - -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c new file mode 100644 index 000000000000..463bac84dfd1 --- /dev/null +++ b/kernel/sched/wait_bit.c @@ -0,0 +1,263 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#include + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wq_entry, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + +int __sched +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + for (;;) { + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wwq_head->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + } + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + if (!ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + return 0; + } else if (ret) { + return ret; + } + } +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wq_entry = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); + + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word, int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word, int mode) +{ + io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); -- cgit v1.2.3 From 5822a454d6d22297c5fcd66264120587b2ec21cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 13:09:07 +0100 Subject: sched/wait: Move bit_wait_table[] and related functionality from sched/core.c to sched/wait_bit.c The key hashed waitqueue data structures and their initialization was done in the main scheduler file for no good reason, move them to sched/wait_bit.c instead. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++---------------- kernel/sched/wait_bit.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b36644536ab..e7b9ef8df126 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -6026,28 +6027,13 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); -#define WAIT_TABLE_BITS 8 -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - unsigned long val = (unsigned long)word << shift | bit; - - return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); -} -EXPORT_SYMBOL(bit_waitqueue); - void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; sched_clock_init(); - - for (i = 0; i < WAIT_TABLE_SIZE; i++) - init_waitqueue_head(bit_wait_table + i); + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 463bac84dfd1..c891b34e1896 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -4,6 +4,21 @@ #include #include #include +#include + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { @@ -261,3 +276,11 @@ __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} -- cgit v1.2.3 From 2055da97389a605c8a00d163d40903afbe413921 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:06:46 +0200 Subject: sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming So I've noticed a number of instances where it was not obvious from the code whether ->task_list was for a wait-queue head or a wait-queue entry. Furthermore, there's a number of wait-queue users where the lists are not for 'tasks' but other entities (poll tables, etc.), in which case the 'task_list' name is actively confusing. To clear this all up, name the wait-queue head and entry list structure fields unambiguously: struct wait_queue_head::task_list => ::head struct wait_queue_entry::task_list => ::entry For example, this code: rqw->wait.task_list.next != &wait->task_list ... is was pretty unclear (to me) what it's doing, while now it's written this way: rqw->wait.head.next != &wait->entry ... which makes it pretty clear that we are iterating a list until we see the head. Other examples are: list_for_each_entry_safe(pos, next, &x->task_list, task_list) { list_for_each_entry(wq, &fence->wait.task_list, task_list) { ... where it's unclear (to me) what we are iterating, and during review it's hard to tell whether it's trying to walk a wait-queue entry (which would be a bug), while now it's written as: list_for_each_entry_safe(pos, next, &x->head, entry) { list_for_each_entry(wq, &fence->wait.head, entry) { Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 24 ++++++++++++------------ kernel/sched/wait_bit.c | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6bcd7c3c4501..17f11c6b0a9f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -16,7 +16,7 @@ void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, st { spin_lock_init(&wq_head->lock); lockdep_set_class_and_name(&wq_head->lock, key, name); - INIT_LIST_HEAD(&wq_head->task_list); + INIT_LIST_HEAD(&wq_head->head); } EXPORT_SYMBOL(__init_waitqueue_head); @@ -68,7 +68,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, { wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -176,7 +176,7 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->task_list)) + if (list_empty(&wq_entry->entry)) __add_wait_queue(wq_head, wq_entry); set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -190,7 +190,7 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->task_list)) + if (list_empty(&wq_entry->entry)) __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -202,7 +202,7 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) wq_entry->flags = flags; wq_entry->private = current; wq_entry->func = autoremove_wake_function; - INIT_LIST_HEAD(&wq_entry->task_list); + INIT_LIST_HEAD(&wq_entry->entry); } EXPORT_SYMBOL(init_wait_entry); @@ -225,10 +225,10 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); ret = -ERESTARTSYS; } else { - if (list_empty(&wq_entry->task_list)) { + if (list_empty(&wq_entry->entry)) { if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) __add_wait_queue_entry_tail(wq_head, wq_entry); else @@ -251,7 +251,7 @@ EXPORT_SYMBOL(prepare_to_wait_event); */ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) + if (likely(list_empty(&wait->entry))) __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); @@ -267,7 +267,7 @@ EXPORT_SYMBOL(do_wait_intr); int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) + if (likely(list_empty(&wait->entry))) __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); @@ -308,9 +308,9 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en * have _one_ other CPU that looks at or modifies * the list). */ - if (!list_empty_careful(&wq_entry->task_list)) { + if (!list_empty_careful(&wq_entry->entry)) { spin_lock_irqsave(&wq_head->lock, flags); - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -321,7 +321,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); return ret; } EXPORT_SYMBOL(autoremove_wake_function); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c891b34e1896..f8159698aa4d 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -205,8 +205,8 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en .wq_entry = { \ .private = current, \ .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } -- cgit v1.2.3 From 6d3aed3d8a0573d0a6eb1160ccd0a0713f4dbc2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:24:42 +0200 Subject: sched/debug: Fix SCHED_WARN_ON() to return a value on !CONFIG_SCHED_DEBUG as well This definition of SCHED_WARN_ON(): #define SCHED_WARN_ON(x) ((void)(x)) is not fully compatible with the 'real' WARN_ON_ONCE() primitive, as it has no return value, so it cannot be used in conditionals. Fix it. Cc: Daniel Axtens Cc: Konstantin Khlebnikov Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f2ef759a4cb6..e0329d10bdb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -39,9 +39,9 @@ #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; -- cgit v1.2.3 From c5ae366e12b2bd56fc7d7e9d484836bec9ddc110 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 11 May 2017 06:11:39 +1000 Subject: sched/fair: WARN() and refuse to set buddy when !se->on_rq If we set a next or last buddy for a se that is not on_rq, we will end up taking a NULL pointer dereference in wakeup_preempt_entity via pick_next_task_fair. Detect when we would be about to do that, throw a warning and then refuse to actually set it. This has been suggested at least twice: https://marc.info/?l=linux-kernel&m=146651668921468&w=2 https://lkml.org/lkml/2016/6/16/663 I recently had to debug a problem with these (we hadn't backported Konstantin's patches in this area) and this would have saved a lot of time/pain. Just do it. Signed-off-by: Daniel Axtens Cc: Ben Segall Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170510201139.16236-1-dja@axtens.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 396bca9c7996..cb3a3da7089f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6164,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6173,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) -- cgit v1.2.3 From f11cc0760b8397e0d230122606421b6a96e9f869 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 14 Jun 2017 19:37:30 -0700 Subject: sched/core: Drop the unused try_get_task_struct() helper function This function was introduced by: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") ... to allow easier usage of task_rcu_dereference(), however no users were ever added. Drop the helper. Signed-off-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/20170615023730.22827-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/exit.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7d694437ab44..c63226283aef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w) rcu_read_unlock(); } -struct task_struct *try_get_task_struct(struct task_struct **ptask) -{ - struct task_struct *task; - - rcu_read_lock(); - task = task_rcu_dereference(ptask); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - return task; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected -- cgit v1.2.3 From d15bc69affc57d7985a01745ca28eafa0772325b Mon Sep 17 00:00:00 2001 From: Peter Meerwald-Stadler Date: Tue, 30 May 2017 21:41:03 +0200 Subject: timers: Fix parameter description of try_to_del_timer_sync() Signed-off-by: Peter Meerwald-Stadler Link: http://lkml.kernel.org/r/20170530194103.7454-1-pmeerw@pmeerw.net Cc: John Stultz Cc: trivial@rustcorp.com.au Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 152a706ef8b8..709a404bd133 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer); /** * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del + * @timer: timer to delete * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. -- cgit v1.2.3 From 35eb7258c009dc478338e674a5a84d25d0929c56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 17:37:35 +0200 Subject: itimer: Make timeval to nsec conversion range limited The expiry time of a itimer is supplied through sys_setitimer() via a struct timeval. The timeval is validated for correctness. In the actual set timer implementation the timeval is converted to a scalar nanoseconds value. If the tv_sec part of the time spec is large enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit. Mitigate that by using the timeval_to_ktime() conversion function, which checks the tv_sec part for a potential mult overflow and clamps the result to KTIME_MAX, which is about 292 years. Reported-by: Xishi Qiu Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170620154113.505981643@linutronix.de --- kernel/time/itimer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9dd7ff5e445a..2ef98a02376a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -152,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_ns(&value->it_interval); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + nval = ktime_to_ns(timeval_to_ktime(value->it_value)); + ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); spin_lock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 098b0e01a91c42aaaf0425605cd126b03fcb0bcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 17:37:36 +0200 Subject: posix-cpu-timers: Make timespec to nsec conversion safe The expiry time of a posix cpu timer is supplied through sys_timer_set() via a struct timespec. The timespec is validated for correctness. In the actual set timer implementation the timespec is converted to a scalar nanoseconds value. If the tv_sec part of the time spec is large enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit. Mitigate that by using the timespec_to_ktime() conversion function, which checks the tv_sec part for a potential mult overflow and clamps the result to KTIME_MAX, which is about 292 years. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Xishi Qiu Cc: John Stultz Link: http://lkml.kernel.org/r/20170620154113.588276707@linutronix.de --- kernel/time/posix-cpu-timers.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9df618ee64cf..60cb24ac9ebc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -580,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec64_to_ns(&new->it_value); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers -- cgit v1.2.3 From fc6eead7c1e2e5376c25d2795d4539fdacbc0648 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 22 May 2017 17:20:20 -0700 Subject: time: Clean up CLOCK_MONOTONIC_RAW time handling Now that we fixed the sub-ns handling for CLOCK_MONOTONIC_RAW, remove the duplicitive tk->raw_time.tv_nsec, which can be stored in tk->tkr_raw.xtime_nsec (similarly to how its handled for monotonic time). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Cc: Kevin Brodsky Cc: Will Deacon Cc: Daniel Mentz Tested-by: Daniel Mentz Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b602c48cb841..0454bfa24353 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } } static inline struct timespec64 tk_xtime(struct timekeeper *tk) @@ -285,12 +289,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; - if (shift_change < 0) + if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; - else + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } } - tk->tkr_raw.xtime_nsec = 0; tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; @@ -619,9 +625,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); - /* Update the monotonic raw base */ - tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); - /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take @@ -631,6 +634,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + seconds = tk->raw_sec; + nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); + tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); } /* must hold timekeeper_lock */ @@ -672,7 +680,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; - u64 nsec; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -684,10 +691,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); - timespec64_add_ns(&tk->raw_time, nsec); + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); } /** @@ -1373,19 +1383,18 @@ int timekeeping_notify(struct clocksource *clock) void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; unsigned long seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); - ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(&ts64, nsecs); - *ts = ts64; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic64); @@ -1509,8 +1518,7 @@ void __init timekeeping_init(void) tk_setup_internals(tk, clock); tk_set_xtime(tk, &now); - tk->raw_time.tv_sec = 0; - tk->raw_time.tv_nsec = 0; + tk->raw_sec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -2011,15 +2019,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; - tk->raw_time.tv_sec++; + tk->raw_sec++; } - tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; - tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; -- cgit v1.2.3 From 369adf04d80a7e179b9ea6d74cc01c233f142f47 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 12 May 2017 10:58:18 -0700 Subject: time: Add warning about imminent deprecation of CONFIG_GENERIC_TIME_VSYSCALL_OLD CONFIG_GENERIC_TIME_VSYSCALL_OLD was introduced five years ago to allow a transition from the old vsyscall implementations to the new method (which simplified internal accounting and made timekeeping more precise). However, PPC and IA64 have yet to make the transition, despite in some cases me sending test patches to try to help it along. http://patches.linaro.org/patch/30501/ http://patches.linaro.org/patch/35412/ If its helpful, my last pass at the patches can be found here: https://git.linaro.org/people/john.stultz/linux.git dev/oldvsyscall-cleanup So I think its time to set a deadline and make it clear this is going away. So this patch adds warnings about this functionality being dropped. Likely to be in v4.15. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Marcelo Tosatti Cc: Paul Mackerras Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Tony Luck Cc: Michael Ellerman Cc: Fenghua Yu Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0454bfa24353..cedafa008de5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -516,6 +516,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. static inline void update_vsyscall(struct timekeeper *tk) { -- cgit v1.2.3 From 8a1898db51a3390241cd5fae267dc8aaa9db0f8b Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 20 Jun 2017 12:26:39 +0200 Subject: perf/aux: Correct return code of rb_alloc_aux() if !has_aux(ev) If the event for which an AUX area is about to be allocated, does not support setting up an AUX area, rb_alloc_aux() return -ENOTSUPP. This error condition is being returned unfiltered to the user space, and, for example, the perf tools fails with: failed to mmap with 524 (INTERNAL ERROR: strerror_r(524, 0x3fff497a1c8, 512)=22) This error can be easily seen with "perf record -m 128,256 -e cpu-clock". The 524 error code maps to -ENOTSUPP (in rb_alloc_aux()). The -ENOTSUPP error code shall be only used within the kernel. So the correct error code would then be -EOPNOTSUPP. With this commit, the perf tool then reports: failed to mmap with 95 (Operation not supported) which is more clear. Signed-off-by: Hendrik Brueckner Acked-by: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pu Hou Cc: Thomas Gleixner Cc: Thomas-Mich Richter Cc: acme@kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1497954399-6355-1-git-send-email-brueckner@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2831480c63a2..ee97196bb151 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* -- cgit v1.2.3 From f160203986a6ad23ab8077c4a25b260fe55d6e26 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:06:58 +0200 Subject: irq/generic-chip: Export irq_init_generic_chip() locally This function will be used in the devres variant of irq_alloc_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-4-git-send-email-brgl@bgdev.pl --- kernel/irq/generic-chip.c | 7 +++---- kernel/irq/internals.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ee32870079c9..f7086b78ad6e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr) iowrite32be(val, addr); } -static void -irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, - int num_ct, unsigned int irq_base, - void __iomem *reg_base, irq_flow_handler_t handler) +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { raw_spin_lock_init(&gc->lock); gc->num_ct = num_ct; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bc226e783bd2..921a2419720c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -226,3 +226,14 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif + +#ifdef CONFIG_GENERIC_IRQ_CHIP +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler); +#else +static inline void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { } +#endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.2.3 From 1c3e36309fe2e94b8a889fa32cb5c871434f8ed6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:06:59 +0200 Subject: irq/generic-chip: Provide devm_irq_alloc_generic_chip() Provide a resource managed variant of irq_alloc_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-5-git-send-email-brgl@bgdev.pl --- kernel/irq/devres.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1613bfd48365..21ee0aebccfb 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -4,6 +4,8 @@ #include #include +#include "internals.h" + /* * Device resource management aware IRQ request/free implementation. */ @@ -198,3 +200,35 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, return base; } EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); + +#ifdef CONFIG_GENERIC_IRQ_CHIP +/** + * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip + * for a managed device + * @dev: Device to allocate the generic chip for + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, + unsigned int irq_base, void __iomem *reg_base, + irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = devm_kzalloc(dev, sz, GFP_KERNEL); + if (gc) + irq_init_generic_chip(gc, name, num_ct, + irq_base, reg_base, handler); + + return gc; +} +EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); +#endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.2.3 From 30fd8fc5c91973485705f83c7efe9588b8e6f371 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:07:00 +0200 Subject: irq/generic-chip: Provide devm_irq_setup_generic_chip() Provide a resource managed variant of irq_setup_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-6-git-send-email-brgl@bgdev.pl --- kernel/irq/devres.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 21ee0aebccfb..194c506d9d20 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -231,4 +231,56 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, return gc; } EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); + +struct irq_generic_chip_devres { + struct irq_chip_generic *gc; + u32 msk; + unsigned int clr; + unsigned int set; +}; + +static void devm_irq_remove_generic_chip(struct device *dev, void *res) +{ + struct irq_generic_chip_devres *this = res; + + irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set); +} + +/** + * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic + * chip for a managed device + * + * @dev: Device to setup the generic chip for + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. + */ +int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, + u32 msk, enum irq_gc_flags flags, + unsigned int clr, unsigned int set) +{ + struct irq_generic_chip_devres *dr; + + dr = devres_alloc(devm_irq_remove_generic_chip, + sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + irq_setup_generic_chip(gc, msk, flags, clr, set); + + dr->gc = gc; + dr->msk = msk; + dr->clr = clr; + dr->set = set; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip); #endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.2.3 From 3c85d6db5e5f05ae6c3d7f5a0ceceb43746a5ca7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:00 +0200 Subject: sched/loadavg: Generalize "_idle" naming to "_nohz" The loadavg naming code still assumes that nohz == idle whereas its code is actually handling well both nohz idle and nohz full. So lets fix the naming according to what the code actually does, to unconfuse the reader. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/loadavg.c | 51 ++++++++++++++++++++++++------------------------ kernel/time/tick-sched.c | 4 ++-- 2 files changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * load-average relies on per-cpu sampling from the tick, it is affected by * NO_HZ. * - * The basic idea is to fold the nr_active delta into a global idle-delta upon + * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * entering NO_HZ state such that we can include this as an 'extra' cpu delta * when we read the global state. * @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * - When we go NO_HZ idle during the window, we can negate our sample * contribution, causing under-accounting. * - * We avoid this by keeping two idle-delta counters and flipping them + * We avoid this by keeping two NO_HZ-delta counters and flipping them * when the window starts, thus separating old and new NO_HZ load. * * The only trick is the slight shift in index flip for read vs write. @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * r:0 0 1 1 0 0 1 1 0 * w:0 1 1 0 0 1 1 0 0 * - * This ensures we'll fold the old idle contribution in this window while + * This ensures we'll fold the old NO_HZ contribution in this window while * accumlating the new one. * - * - When we wake up from NO_HZ idle during the window, we push up our + * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which + * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. + * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_idle[2]; +static atomic_long_t calc_load_nohz[2]; static int calc_load_idx; static inline int calc_load_write_idx(void) @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) /* * If the folding window started, make sure we start writing in the - * next idle-delta. + * next NO_HZ-delta. */ if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_enter_idle(void) +void calc_load_nohz_start(void) { struct rq *this_rq = this_rq(); long delta; /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. */ delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); + atomic_long_add(delta, &calc_load_nohz[idx]); } } -void calc_load_exit_idle(void) +void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_fold_idle(void) +static long calc_load_nohz_fold(void) { int idx = calc_load_read_idx(); long delta = 0; - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); + if (atomic_long_read(&calc_load_nohz[idx])) + delta = atomic_long_xchg(&calc_load_nohz[idx], 0); return delta; } @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, /* * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. + * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into + * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold + * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. @@ -330,7 +330,7 @@ static void calc_global_nohz(void) } /* - * Flip the idle index... + * Flip the NO_HZ index... * * Make sure we first write the new time then flip the index, so that * calc_load_write_idx() will see the new time when it reads the new @@ -341,7 +341,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_fold_idle(void) { return 0; } +static inline long calc_load_nohz_fold(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. */ - delta = calc_load_fold_idle(); + delta = calc_load_nohz_fold(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + * In case we went to NO_HZ for multiple LOAD_FREQ intervals + * catch up in bulk. */ calc_global_nohz(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9c2dc64e31d8..b1b58a07e042 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -783,7 +783,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + calc_load_nohz_start(); cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -823,7 +823,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) */ timer_clear_idle(); - calc_load_exit_idle(); + calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3 From a0db971e4eb69fc84eb3d7ef94f718b483550b4a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:01 +0200 Subject: nohz: Move idle balancer registration to the idle path The idle load balancing registration path assumes that we only stop the tick when the CPU is idle, ignoring the nohz full case. As a result, a nohz full CPU that is running a task may be chosen to perform idle load balancing. Lets make sure that only CPUs in dynticks idle mode can be picked as idle load balancers. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b1b58a07e042..db023e9cbb25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -782,7 +782,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); calc_load_nohz_start(); cpu_load_update_nohz_start(); @@ -923,8 +922,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_expires = expires; } - if (!was_stopped && ts->tick_stopped) + if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } } } -- cgit v1.2.3 From 387bc8b5536eeb0a92f4b4ab553539eaea2ac0ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:02 +0200 Subject: sched/fair: Spare idle load balancing on nohz_full CPUs Although idle load balancing obviously only concerns idle CPUs, it can be a disturbance on a busy nohz_full CPU. Indeed a CPU can only get rid of an idle load balancing duty once a tick fires while it runs a task and this can take a while on a nohz_full CPU. We could fix that and escape the idle load balancing duty from the very idle exit path but that would bring unecessary overhead. Lets just not bother and leave that job to housekeeping CPUs (those outside nohz_full range). The nohz_full CPUs simply don't want any disturbance. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a24661ac3d23..694c258b8771 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8683,6 +8683,10 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!is_housekeeping_cpu(cpu)) + return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; -- cgit v1.2.3 From 0165308a2f994939d2e1b36624f5a8f57746bc88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:04 +0200 Subject: genirq/msi: Prevent overwriting domain name Prevent overwriting an already assigned domain name. Remove the extra check for chip->name, because if domain->name is NULL overwriting it with NULL is not a problem. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235443.510684976@linutronix.de --- kernel/irq/msi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index fe4d48ec5bc4..9e3f1857c6bd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -274,7 +274,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, fwnode, &msi_domain_ops, info); - if (domain && info->chip && info->chip->name) + + if (domain && !domain->name && info->chip) domain->name = info->chip->name; return domain; -- cgit v1.2.3 From d59f6617eef0f76e34f7a9993f5645c5ef467e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:05 +0200 Subject: genirq: Allow fwnode to carry name information only In order to provide proper debug interface it's required to have domain names available when the domain is added. Non fwnode based architectures like x86 have no way to do so. It's not possible to use domain ops or host data for this as domain ops might be the same for several instances, but the names have to be unique. Extend the irqchip fwnode to allow transporting the domain name. If no node is supplied, create a 'unknown-N' placeholder. Warn if an invalid node is supplied and treat it like no node. This happens e.g. with i2 devices on x86 which hand in an ACPI type node which has no interface for retrieving the name. [ Folded a fix from Marc to make DT name parsing work ] Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235443.588784933@linutronix.de --- kernel/irq/irqdomain.c | 105 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 70b9da72018b..e1b925bea205 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -26,39 +26,61 @@ static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { - struct fwnode_handle fwnode; - char *name; - void *data; + struct fwnode_handle fwnode; + unsigned int type; + char *name; + void *data; }; /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain - * @data: optional user-provided data + * @type: Type of irqchip_fwnode. See linux/irqdomain.h + * @name: Optional user provided domain name + * @id: Optional user provided id if name != NULL + * @data: Optional user-provided data * - * Allocate a struct device_node, and return a poiner to the embedded + * Allocate a struct irqchip_fwid, and return a poiner to the embedded * fwnode_handle (or NULL on failure). + * + * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are + * solely to transport name information to irqdomain creation code. The + * node is not stored. For other types the pointer is kept in the irq + * domain struct. */ -struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, + const char *name, void *data) { struct irqchip_fwid *fwid; - char *name; + char *n; fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); - name = kasprintf(GFP_KERNEL, "irqchip@%p", data); - if (!fwid || !name) { + switch (type) { + case IRQCHIP_FWNODE_NAMED: + n = kasprintf(GFP_KERNEL, "%s", name); + break; + case IRQCHIP_FWNODE_NAMED_ID: + n = kasprintf(GFP_KERNEL, "%s-%d", name, id); + break; + default: + n = kasprintf(GFP_KERNEL, "irqchip@%p", data); + break; + } + + if (!fwid || !n) { kfree(fwid); - kfree(name); + kfree(n); return NULL; } - fwid->name = name; + fwid->type = type; + fwid->name = n; fwid->data = data; fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } -EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); +EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -97,20 +119,75 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, void *host_data) { struct device_node *of_node = to_of_node(fwnode); + struct irqchip_fwid *fwid; struct irq_domain *domain; + static atomic_t unknown_domains; + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; + if (fwnode && is_fwnode_irqchip(fwnode)) { + fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + switch (fwid->type) { + case IRQCHIP_FWNODE_NAMED: + case IRQCHIP_FWNODE_NAMED_ID: + domain->name = kstrdup(fwid->name, GFP_KERNEL); + if (!domain->name) { + kfree(domain); + return NULL; + } + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + break; + default: + domain->fwnode = fwnode; + domain->name = fwid->name; + break; + } + } else if (of_node) { + char *name; + + /* + * DT paths contain '/', which debugfs is legitimately + * unhappy about. Replace them with ':', which does + * the trick and is not as offensive as '\'... + */ + name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); + if (!name) { + kfree(domain); + return NULL; + } + + strreplace(name, '/', ':'); + + domain->name = name; + domain->fwnode = fwnode; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + if (!domain->name) { + if (fwnode) { + pr_err("Invalid fwnode type (%d) for irqdomain\n", + fwnode->type); + } + domain->name = kasprintf(GFP_KERNEL, "unknown-%d", + atomic_inc_return(&unknown_domains)); + if (!domain->name) { + kfree(domain); + return NULL; + } + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + of_node_get(of_node); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; - domain->fwnode = fwnode; domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -152,6 +229,8 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); of_node_put(irq_domain_get_of_node(domain)); + if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) + kfree(domain->name); kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); -- cgit v1.2.3 From 9dc6be3d419398eae9a19cd09b7969ceff8eaf10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:16 +0200 Subject: genirq/irqdomain: Add map counter Add a map counter instead of counting radix tree entries for diagnosis. That also gives correct information for linear domains. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.459397746@linutronix.de --- kernel/irq/irqdomain.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e1b925bea205..8d5805c655b6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -423,6 +423,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) irq_data->domain = NULL; irq_data->hwirq = 0; + domain->mapcount--; /* Clear reverse map for this hwirq */ if (hwirq < domain->revmap_size) { @@ -474,6 +475,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, domain->name = irq_data->chip->name; } + domain->mapcount++; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { @@ -1081,6 +1083,7 @@ static void irq_domain_insert_irq(int virq) struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; + domain->mapcount++; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { @@ -1110,6 +1113,7 @@ static void irq_domain_remove_irq(int virq) struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; + domain->mapcount--; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { -- cgit v1.2.3 From 087cdfb662ae50e3826e7cd2e54b6519d07b60f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:17 +0200 Subject: genirq/debugfs: Add proper debugfs interface Debugging (hierarchical) interupt domains is tedious as there is no information about the hierarchy and no information about states of interrupts in the various domain levels. Add a debugfs directory 'irq' and subdirectories 'domains' and 'irqs'. The domains directory contains the domain files. The content is information about the domain. If the domain is part of a hierarchy then the parent domains are printed as well. # ls /sys/kernel/debug/irq/domains/ default INTEL-IR-2 INTEL-IR-MSI-2 IO-APIC-IR-2 PCI-MSI DMAR-MSI INTEL-IR-3 INTEL-IR-MSI-3 IO-APIC-IR-3 unknown-1 INTEL-IR-0 INTEL-IR-MSI-0 IO-APIC-IR-0 IO-APIC-IR-4 VECTOR INTEL-IR-1 INTEL-IR-MSI-1 IO-APIC-IR-1 PCI-HT # cat /sys/kernel/debug/irq/domains/VECTOR name: VECTOR size: 0 mapped: 216 flags: 0x00000041 # cat /sys/kernel/debug/irq/domains/IO-APIC-IR-0 name: IO-APIC-IR-0 size: 24 mapped: 19 flags: 0x00000041 parent: INTEL-IR-3 name: INTEL-IR-3 size: 65536 mapped: 167 flags: 0x00000041 parent: VECTOR name: VECTOR size: 0 mapped: 216 flags: 0x00000041 Unfortunately there is no per cpu information about the VECTOR domain (yet). The irqs directory contains detailed information about mapped interrupts. # cat /sys/kernel/debug/irq/irqs/3 handler: handle_edge_irq status: 0x00004000 istate: 0x00000000 ddepth: 1 wdepth: 0 dstate: 0x01018000 IRQD_IRQ_DISABLED IRQD_SINGLE_TARGET IRQD_MOVE_PCNTXT node: 0 affinity: 0-143 effectiv: 0 pending: domain: IO-APIC-IR-0 hwirq: 0x3 chip: IR-IO-APIC flags: 0x10 IRQCHIP_SKIP_SET_WAKE parent: domain: INTEL-IR-3 hwirq: 0x20000 chip: INTEL-IR flags: 0x0 parent: domain: VECTOR hwirq: 0x3 chip: APIC flags: 0x0 This was developed to simplify the debugging of the managed affinity changes. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.537566163@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 11 +++ kernel/irq/Makefile | 1 + kernel/irq/debugfs.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 22 +++++ kernel/irq/irqdesc.c | 1 + kernel/irq/irqdomain.c | 87 +++++++++++++++++++- kernel/irq/manage.c | 1 + 7 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 kernel/irq/debugfs.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3bbfd6a9c475..8d9498e51585 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -108,4 +108,15 @@ config SPARSE_IRQ If you don't know what to do here, say N. +config GENERIC_IRQ_DEBUGFS + bool "Expose irq internals in debugfs" + depends on DEBUG_FS + default n + ---help--- + + Exposes internal state information through debugfs. Mostly for + developers and debugging of hard to diagnose interrupt problems. + + If you don't know what to do here, say N. + endmenu diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1d3ee3169202..c61fc9c2d1f7 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o +obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c new file mode 100644 index 000000000000..50ee2f6593e8 --- /dev/null +++ b/kernel/irq/debugfs.c @@ -0,0 +1,215 @@ +/* + * Copyright 2017 Thomas Gleixner + * + * This file is licensed under the GPL V2. + */ +#include +#include +#include + +#include "internals.h" + +static struct dentry *irq_dir; + +struct irq_bit_descr { + unsigned int mask; + char *name; +}; +#define BIT_MASK_DESCR(m) { .mask = m, .name = #m } + +static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, + const struct irq_bit_descr *sd, int size) +{ + int i; + + for (i = 0; i < size; i++, sd++) { + if (state & sd->mask) + seq_printf(m, "%*s%s\n", ind + 12, "", sd->name); + } +} + +#ifdef CONFIG_SMP +static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct cpumask *msk; + + msk = irq_data_get_affinity_mask(data); + seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); +#ifdef CONFIG_GENERIC_PENDING_IRQ + msk = desc->pending_mask; + seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk)); +#endif +} +#else +static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { } +#endif + +static const struct irq_bit_descr irqchip_flags[] = { + BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED), + BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED), + BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND), + BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED), + BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE), + BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), + BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), +}; + +static void +irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) +{ + struct irq_chip *chip = data->chip; + + if (!chip) { + seq_printf(m, "chip: None\n"); + return; + } + seq_printf(m, "%*schip: %s\n", ind, "", chip->name); + seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, + ARRAY_SIZE(irqchip_flags)); +} + +static void +irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) +{ + seq_printf(m, "%*sdomain: %s\n", ind, "", + data->domain ? data->domain->name : ""); + seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); + irq_debug_show_chip(m, data, ind + 1); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (!data->parent_data) + return; + seq_printf(m, "%*sparent:\n", ind + 1, ""); + irq_debug_show_data(m, data->parent_data, ind + 4); +#endif +} + +static const struct irq_bit_descr irqdata_states[] = { + BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING), + BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING), + BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH), + BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW), + BIT_MASK_DESCR(IRQD_LEVEL), + + BIT_MASK_DESCR(IRQD_ACTIVATED), + BIT_MASK_DESCR(IRQD_IRQ_STARTED), + BIT_MASK_DESCR(IRQD_IRQ_DISABLED), + BIT_MASK_DESCR(IRQD_IRQ_MASKED), + BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS), + + BIT_MASK_DESCR(IRQD_PER_CPU), + BIT_MASK_DESCR(IRQD_NO_BALANCING), + + BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), + BIT_MASK_DESCR(IRQD_AFFINITY_SET), + BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), + BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + + BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), + + BIT_MASK_DESCR(IRQD_WAKEUP_STATE), + BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), +}; + +static const struct irq_bit_descr irqdesc_states[] = { + BIT_MASK_DESCR(_IRQ_NOPROBE), + BIT_MASK_DESCR(_IRQ_NOREQUEST), + BIT_MASK_DESCR(_IRQ_NOTHREAD), + BIT_MASK_DESCR(_IRQ_NOAUTOEN), + BIT_MASK_DESCR(_IRQ_NESTED_THREAD), + BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), + BIT_MASK_DESCR(_IRQ_IS_POLLED), + BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), +}; + +static const struct irq_bit_descr irqdesc_istates[] = { + BIT_MASK_DESCR(IRQS_AUTODETECT), + BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED), + BIT_MASK_DESCR(IRQS_POLL_INPROGRESS), + BIT_MASK_DESCR(IRQS_ONESHOT), + BIT_MASK_DESCR(IRQS_REPLAY), + BIT_MASK_DESCR(IRQS_WAITING), + BIT_MASK_DESCR(IRQS_PENDING), + BIT_MASK_DESCR(IRQS_SUSPENDED), +}; + + +static int irq_debug_show(struct seq_file *m, void *p) +{ + struct irq_desc *desc = m->private; + struct irq_data *data; + + raw_spin_lock_irq(&desc->lock); + data = irq_desc_get_irq_data(desc); + seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); + irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, + ARRAY_SIZE(irqdesc_states)); + seq_printf(m, "istate: 0x%08x\n", desc->istate); + irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates, + ARRAY_SIZE(irqdesc_istates)); + seq_printf(m, "ddepth: %u\n", desc->depth); + seq_printf(m, "wdepth: %u\n", desc->wake_depth); + seq_printf(m, "dstate: 0x%08x\n", irqd_get(data)); + irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states, + ARRAY_SIZE(irqdata_states)); + seq_printf(m, "node: %d\n", irq_data_get_node(data)); + irq_debug_show_masks(m, desc); + irq_debug_show_data(m, data, 0); + raw_spin_unlock_irq(&desc->lock); + return 0; +} + +static int irq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_debug_show, inode->i_private); +} + +static const struct file_operations dfs_irq_ops = { + .open = irq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) +{ + char name [10]; + + if (!irq_dir || !desc || desc->debugfs_file) + return; + + sprintf(name, "%d", irq); + desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc, + &dfs_irq_ops); +} + +void irq_remove_debugfs_entry(struct irq_desc *desc) +{ + if (desc->debugfs_file) + debugfs_remove(desc->debugfs_file); +} + +static int __init irq_debugfs_init(void) +{ + struct dentry *root_dir; + int irq; + + root_dir = debugfs_create_dir("irq", NULL); + if (!root_dir) + return -ENOMEM; + + irq_domain_debugfs_init(root_dir); + + irq_dir = debugfs_create_dir("irqs", root_dir); + + irq_lock_sparse(); + for_each_active_irq(irq) + irq_add_debugfs_entry(irq, irq_to_desc(irq)); + irq_unlock_sparse(); + + return 0; +} +__initcall(irq_debugfs_init); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 921a2419720c..094db5bfb83f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -169,6 +169,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) +static inline unsigned int irqd_get(struct irq_data *d) +{ + return __irqd_to_state(d); +} + /* * Manipulation functions for irq_data.state */ @@ -237,3 +242,20 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); +void irq_remove_debugfs_entry(struct irq_desc *desc); +# ifdef CONFIG_IRQ_DOMAIN +void irq_domain_debugfs_init(struct dentry *root); +# else +static inline void irq_domain_debugfs_init(struct dentry *root); +# endif +#else /* CONFIG_GENERIC_IRQ_DEBUGFS */ +static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) +{ +} +static inline void irq_remove_debugfs_entry(struct irq_desc *d) +{ +} +#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 09abce2ea8f0..feade536b6d1 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -394,6 +394,7 @@ static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8d5805c655b6..75e1f0851c33 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -29,9 +29,17 @@ struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; - void *data; + void *data; }; +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void debugfs_add_domain_dir(struct irq_domain *d); +static void debugfs_remove_domain_dir(struct irq_domain *d); +#else +static inline void debugfs_add_domain_dir(struct irq_domain *d) { } +static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } +#endif + /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain @@ -194,6 +202,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); + debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); @@ -213,6 +222,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_add); void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); + debugfs_remove_domain_dir(domain); WARN_ON(!radix_tree_empty(&domain->revmap_tree)); @@ -1599,3 +1609,78 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) { } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static struct dentry *domain_dir; + +static void +irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) +{ + seq_printf(m, "%*sname: %s\n", ind, "", d->name); + seq_printf(m, "%*ssize: %u\n", ind + 1, "", + d->revmap_size + d->revmap_direct_max_irq); + seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); + seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (!d->parent) + return; + seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name); + irq_domain_debug_show_one(m, d->parent, ind + 4); +#endif +} + +static int irq_domain_debug_show(struct seq_file *m, void *p) +{ + struct irq_domain *d = m->private; + + /* Default domain? Might be NULL */ + if (!d) { + if (!irq_default_domain) + return 0; + d = irq_default_domain; + } + irq_domain_debug_show_one(m, d, 0); + return 0; +} + +static int irq_domain_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_domain_debug_show, inode->i_private); +} + +static const struct file_operations dfs_domain_ops = { + .open = irq_domain_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void debugfs_add_domain_dir(struct irq_domain *d) +{ + if (!d->name || !domain_dir || d->debugfs_file) + return; + d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, + &dfs_domain_ops); +} + +static void debugfs_remove_domain_dir(struct irq_domain *d) +{ + if (d->debugfs_file) + debugfs_remove(d->debugfs_file); +} + +void __init irq_domain_debugfs_init(struct dentry *root) +{ + struct irq_domain *d; + + domain_dir = debugfs_create_dir("domains", root); + if (!domain_dir) + return; + + debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(d, &irq_domain_list, link) + debugfs_add_domain_dir(d); + mutex_unlock(&irq_domain_mutex); +} +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c34696ca575..284f4eb1ffbe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1398,6 +1398,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); + irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); free_cpumask_var(mask); -- cgit v1.2.3 From cdd16365b0bd7c0cd19e2cc768b6bdc8021f32c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:19 +0200 Subject: genirq: Provide irq_fixup_move_pending() If an CPU goes offline, the interrupts are migrated away, but a eventually pending interrupt move, which has not yet been made effective is kept pending even if the outgoing CPU is the sole target of the pending affinity mask. What's worse is, that the pending affinity mask is discarded even if it would contain a valid subset of the online CPUs. Implement a helper function which allows to avoid these issues. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.691345468@linutronix.de --- kernel/irq/migration.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 37ddb7bda651..6ca054a3f91d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,6 +4,36 @@ #include "internals.h" +/** + * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU + * @desc: Interrupt descpriptor to clean up + * @force_clear: If set clear the move pending bit unconditionally. + * If not set, clear it only when the dying CPU is the + * last one in the pending mask. + * + * Returns true if the pending bit was set and the pending mask contains an + * online CPU other than the dying CPU. + */ +bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + + if (!irqd_is_setaffinity_pending(data)) + return false; + + /* + * The outgoing CPU might be the last online target in a pending + * interrupt move. If that's the case clear the pending move bit. + */ + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) { + irqd_clr_move_pending(data); + return false; + } + if (force_clear) + irqd_clr_move_pending(data); + return true; +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); -- cgit v1.2.3 From cba4235e6031e9318d68186f6d765c531cbea4e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:21 +0200 Subject: genirq: Remove mask argument from setup_affinity() No point to have this alloc/free dance of cpumasks. Provide a static mask for setup_affinity() and protect it proper. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.851571573@linutronix.de --- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 53 ++++++++++++++++++++++---------------------------- kernel/irq/proc.c | 8 +++++--- 3 files changed, 29 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 094db5bfb83f..33ca83816b8c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -109,7 +109,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); +extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 284f4eb1ffbe..e2f20d553d60 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -345,15 +345,18 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) +static int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; - int node = irq_desc_get_node(desc); + int ret, node = irq_desc_get_node(desc); + static DEFINE_RAW_SPINLOCK(mask_lock); + static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; + raw_spin_lock(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. @@ -367,43 +370,42 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } - cpumask_and(mask, cpu_online_mask, set); + cpumask_and(&mask, cpu_online_mask, set); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ - if (cpumask_intersects(mask, nodemask)) - cpumask_and(mask, mask, nodemask); + if (cpumask_intersects(&mask, nodemask)) + cpumask_and(&mask, &mask, nodemask); } - irq_do_set_affinity(&desc->irq_data, mask, false); - return 0; + ret = irq_do_set_affinity(&desc->irq_data, &mask, false); + raw_spin_unlock(&mask_lock); + return ret; } #else /* Wrapper for ALPHA specific affinity selector magic */ -static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) +int irq_setup_affinity(struct irq_desc *desc) { - return irq_select_affinity(irq_desc_get_irq(d)); + return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* - * Called when affinity is set via /proc/irq + * Called when a bogus affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) +int irq_select_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(desc, mask); + ret = irq_setup_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } - #else -static inline int -setup_affinity(struct irq_desc *desc, struct cpumask *mask) +static inline int setup_affinity(struct irq_desc *desc) { return 0; } @@ -1128,7 +1130,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; - cpumask_var_t mask; if (!desc) return -EINVAL; @@ -1187,11 +1188,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } } - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out_thread; - } - /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a @@ -1256,7 +1252,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (thread_mask == ~0UL) { ret = -EBUSY; - goto out_mask; + goto out_unlock; } /* * The thread_mask for the action is or'ed to @@ -1300,7 +1296,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; - goto out_mask; + goto out_unlock; } if (!shared) { @@ -1308,7 +1304,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mask; + goto out_unlock; } init_waitqueue_head(&desc->wait_for_threads); @@ -1320,7 +1316,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) { irq_release_resources(desc); - goto out_mask; + goto out_unlock; } } @@ -1357,7 +1353,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* Set default affinity mask once everything is setup */ - setup_affinity(desc, mask); + irq_setup_affinity(desc); } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; @@ -1401,8 +1397,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); - free_cpumask_var(mask); - return 0; mismatch: @@ -1415,9 +1409,8 @@ mismatch: } ret = -EBUSY; -out_mask: +out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - free_cpumask_var(mask); out_thread: if (new->thread) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c53edad7b459..d35bb8d4c317 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -120,9 +120,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, * one online CPU still has to be targeted. */ if (!cpumask_intersects(new_value, cpu_online_mask)) { - /* Special case for empty set - allow the architecture - code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; + /* + * Special case for empty set - allow the architecture code + * to set default SMP affinity. + */ + err = irq_select_affinity_usr(irq) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; -- cgit v1.2.3 From 43564bd97d0e6182bbd43b51b33254c728832551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:22 +0200 Subject: genirq: Rename setup_affinity() to irq_setup_affinity() Rename it with a proper irq_ prefix and make it available for other files in the core code. Preparatory patch for moving the irq affinity setup around. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.928501004@linutronix.de --- kernel/irq/internals.h | 6 ++++++ kernel/irq/manage.c | 7 +------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 33ca83816b8c..2d7927d9fb57 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -116,6 +116,12 @@ extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); +#ifdef CONFIG_SMP +extern int irq_setup_affinity(struct irq_desc *desc); +#else +static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } +#endif + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e2f20d553d60..907fb791ff63 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int irq_setup_affinity(struct irq_desc *desc) +int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int ret, node = irq_desc_get_node(desc); @@ -404,11 +404,6 @@ int irq_select_affinity_usr(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -#else -static inline int setup_affinity(struct irq_desc *desc) -{ - return 0; -} #endif /** -- cgit v1.2.3 From 2e051552df69af6d134c2592d0d6f1ac80f01190 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:23 +0200 Subject: genirq: Move initial affinity setup to irq_startup() The startup vs. setaffinity ordering of interrupts depends on the IRQF_NOAUTOEN flag. Chained interrupts are not getting any affinity assignment at all. A regular interrupt is started up and then the affinity is set. A IRQF_NOAUTOEN marked interrupt is not started up, but the affinity is set nevertheless. Move the affinity setup to startup_irq() so the ordering is always the same and chained interrupts get the proper default affinity assigned as well. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.020534783@linutronix.de --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 15 ++++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bc1331f84fb5..e290d73b88e2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,8 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_enable(desc); } irq_state_set_started(desc); + /* Set default affinity mask once everything is setup */ + irq_setup_affinity(desc); } if (resend) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 907fb791ff63..1e283073cecc 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1327,6 +1327,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; + /* Exclude IRQ from balancing if requested */ + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } + if (irq_settings_can_autoenable(desc)) { irq_startup(desc, true); } else { @@ -1341,15 +1347,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->depth = 1; } - /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) { - irq_settings_set_no_balancing(desc); - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - } - - /* Set default affinity mask once everything is setup */ - irq_setup_affinity(desc); - } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); -- cgit v1.2.3 From 137221df69c6f8a7002f82dc3d95052d34f5667e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2017 01:37:24 +0200 Subject: genirq: Move pending helpers to internal.h So that the affinity code can reuse them. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170619235445.109426284@linutronix.de --- kernel/irq/internals.h | 38 ++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 28 ---------------------------- 2 files changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 2d7927d9fb57..20b197f0a7b5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -249,6 +249,44 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return irqd_can_move_in_process_context(data); +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return irqd_is_setaffinity_pending(data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else /* CONFIG_GENERIC_PENDING_IRQ */ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return true; +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return false; +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ +} +#endif /* CONFIG_GENERIC_PENDING_IRQ */ + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); void irq_remove_debugfs_entry(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1e283073cecc..7dcf19397c39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -168,34 +168,6 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } -#ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_data *data) -{ - return irqd_can_move_in_process_context(data); -} -static inline bool irq_move_pending(struct irq_data *data) -{ - return irqd_is_setaffinity_pending(data); -} -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) -{ - cpumask_copy(desc->pending_mask, mask); -} -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc *desc) -{ - cpumask_copy(mask, desc->pending_mask); -} -#else -static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_data *data) { return false; } -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } -#endif - int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { -- cgit v1.2.3 From 0dd945ff4647a1f29c6ae8f4f9a69c8f37c994cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:25 +0200 Subject: genirq/cpuhotplug: Remove irq disabling logic This is called from stop_machine() with interrupts disabled. No point in disabling them some more. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.198042748@linutronix.de --- kernel/irq/cpuhotplug.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..705139831590 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -59,11 +59,8 @@ static bool migrate_one_irq(struct irq_desc *desc) */ void irq_migrate_all_off_this_cpu(void) { - unsigned int irq; struct irq_desc *desc; - unsigned long flags; - - local_irq_save(flags); + unsigned int irq; for_each_active_irq(irq) { bool affinity_broken; @@ -73,10 +70,9 @@ void irq_migrate_all_off_this_cpu(void) affinity_broken = migrate_one_irq(desc); raw_spin_unlock(&desc->lock); - if (affinity_broken) - pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", + if (affinity_broken) { + pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n", irq, smp_processor_id()); + } } - - local_irq_restore(flags); } -- cgit v1.2.3 From 735c09524d3e7c92315e8e2699a1b9acb4fb415c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:26 +0200 Subject: genirq/cpuhotplug: Dont claim success on error In case the affinity of an interrupt was broken, a printk is emitted. But if the affinity cannot be set at all due to a missing irq_set_affinity() callback or due to a failing callback, the message is still printed preceeded by a warning/error. That makes no sense whatsoever. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.274852976@linutronix.de --- kernel/irq/cpuhotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 705139831590..9c5521b247d5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -37,11 +37,14 @@ static bool migrate_one_irq(struct irq_desc *desc) c = irq_data_get_irq_chip(d); if (!c->irq_set_affinity) { pr_debug("IRQ%u: unable to set affinity\n", d->irq); + ret = false; } else { int r = irq_do_set_affinity(d, affinity, false); - if (r) + if (r) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, r); + ret = false; + } } return ret; -- cgit v1.2.3 From e8a7035039306c90bcc99129ffc18e0be052bbb9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:27 +0200 Subject: genirq/cpuhotplug: Reorder check logic Move the checks for a valid irq chip and the irq_set_affinity() callback right in front of the whole migration logic. No point in doing a gazillion of other things when the interrupt cannot be migrated at all. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.354181630@linutronix.de --- kernel/irq/cpuhotplug.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 9c5521b247d5..41fe1e04d5d9 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -17,9 +17,20 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(d); const struct cpumask *affinity = d->common->affinity; - struct irq_chip *c; - bool ret = false; + bool brokeaff = false; + int err; + + /* + * IRQ chip might be already torn down, but the irq descriptor is + * still in the radix tree. Also if the chip has no affinity setter, + * nothing can be done here. + */ + if (!chip || !chip->irq_set_affinity) { + pr_debug("IRQ %u: Unable to migrate away\n", d->irq); + return false; + } /* * If this is a per-CPU interrupt, or the affinity does not @@ -31,23 +42,16 @@ static bool migrate_one_irq(struct irq_desc *desc) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; - ret = true; + brokeaff = true; } - c = irq_data_get_irq_chip(d); - if (!c->irq_set_affinity) { - pr_debug("IRQ%u: unable to set affinity\n", d->irq); - ret = false; - } else { - int r = irq_do_set_affinity(d, affinity, false); - if (r) { - pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", - d->irq, r); - ret = false; - } + err = irq_do_set_affinity(d, affinity, false); + if (err) { + pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", + d->irq, err); + return false; } - - return ret; + return brokeaff; } /** -- cgit v1.2.3 From 91f26cb4cd3c22bd656ab46c49329aacaaab5504 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:28 +0200 Subject: genirq/cpuhotplug: Do not migrated shutdown irqs Interrupts, which are shut down are tried to be migrated as well. That's pointless because the interrupt cannot fire and the next startup will move it to the proper place anyway. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.447550992@linutronix.de --- kernel/irq/cpuhotplug.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 41fe1e04d5d9..09b20e127aee 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -33,10 +33,15 @@ static bool migrate_one_irq(struct irq_desc *desc) } /* - * If this is a per-CPU interrupt, or the affinity does not - * include this CPU, then we have nothing to do. + * No move required, if: + * - Interrupt is per cpu + * - Interrupt is not started + * - Affinity mask does not include this CPU. + * + * Note: Do not check desc->action as this might be a chained + * interrupt. */ - if (irqd_is_per_cpu(d) || + if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) return false; -- cgit v1.2.3 From f0383c24b4855f6a4b5a358c7b2d2c16e0437e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:29 +0200 Subject: genirq/cpuhotplug: Add support for cleaning up move in progress In order to move x86 to the generic hotplug migration code, add support for cleaning up move in progress bits. On architectures which have this x86 specific (mis)feature not enabled, this is optimized out by the compiler. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.525817311@linutronix.de --- kernel/irq/cpuhotplug.c | 28 ++++++++++++++++++++++++++-- kernel/irq/internals.h | 10 +++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 09b20e127aee..4be4bd669d81 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,7 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *chip = irq_data_get_irq_chip(d); - const struct cpumask *affinity = d->common->affinity; + const struct cpumask *affinity; bool brokeaff = false; int err; @@ -41,9 +41,33 @@ static bool migrate_one_irq(struct irq_desc *desc) * Note: Do not check desc->action as this might be a chained * interrupt. */ + affinity = irq_data_get_affinity_mask(d); if (irqd_is_per_cpu(d) || !irqd_is_started(d) || - !cpumask_test_cpu(smp_processor_id(), affinity)) + !cpumask_test_cpu(smp_processor_id(), affinity)) { + /* + * If an irq move is pending, abort it if the dying CPU is + * the sole target. + */ + irq_fixup_move_pending(desc, false); return false; + } + + /* + * Complete an eventually pending irq move cleanup. If this + * interrupt was moved in hard irq context, then the vectors need + * to be cleaned up. It can't wait until this interrupt actually + * happens and this CPU was involved. + */ + irq_force_complete_move(desc); + + /* + * If there is a setaffinity pending, then try to reuse the pending + * mask, so the last change of the affinity does not get lost. If + * there is no move pending or the pending mask does not contain + * any online CPU, use the current affinity mask. + */ + if (irq_fixup_move_pending(desc, true)) + affinity = irq_desc_get_pending_mask(desc); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 20b197f0a7b5..fd4fa8382b8f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -268,6 +268,10 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { cpumask_copy(mask, desc->pending_mask); } +static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) +{ + return desc->pending_mask; +} #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -285,7 +289,11 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } -#endif /* CONFIG_GENERIC_PENDING_IRQ */ +static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) +{ + return NULL; +} +#endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); -- cgit v1.2.3 From 47a06d3a783217acae02976f15ca07ddc1ac024f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:30 +0200 Subject: genirq/cpuhotplug: Add support for conditional masking Interrupts which cannot be migrated in process context, need to be masked before the affinity is changed forcefully. Add support for that. Will be compiled out for architectures which do not have this x86 specific issue. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.604565591@linutronix.de --- kernel/irq/cpuhotplug.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 4be4bd669d81..6f46587a9ce5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,6 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *chip = irq_data_get_irq_chip(d); + bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d); const struct cpumask *affinity; bool brokeaff = false; int err; @@ -69,6 +70,10 @@ static bool migrate_one_irq(struct irq_desc *desc) if (irq_fixup_move_pending(desc, true)) affinity = irq_desc_get_pending_mask(desc); + /* Mask the chip for interrupts which cannot move in process context */ + if (maskchip && chip->irq_mask) + chip->irq_mask(d); + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; brokeaff = true; @@ -78,8 +83,12 @@ static bool migrate_one_irq(struct irq_desc *desc) if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); - return false; + brokeaff = false; } + + if (maskchip && chip->irq_unmask) + chip->irq_unmask(d); + return brokeaff; } -- cgit v1.2.3 From 77f85e66aa8be563ae5804eebf74a78ec6ef5555 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:31 +0200 Subject: genirq/cpuhotplug: Set force affinity flag on hotplug migration Set the force migration flag when migrating interrupts away from an outgoing CPU. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.681874648@linutronix.de --- kernel/irq/cpuhotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6f46587a9ce5..e09cb91a7c8b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -79,7 +79,7 @@ static bool migrate_one_irq(struct irq_desc *desc) brokeaff = true; } - err = irq_do_set_affinity(d, affinity, false); + err = irq_do_set_affinity(d, affinity, true); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.2.3 From 36d84fb45140f151fa4e145381dbce5e5ffed24d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:34 +0200 Subject: genirq: Move irq_fixup_move_pending() to core Now that x86 uses the generic code, the function declaration and inline stub can move to the core internal header. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.928156166@linutronix.de --- kernel/irq/internals.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fd4fa8382b8f..040806f1124c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -272,6 +272,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } +bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -293,6 +294,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return NULL; } +static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) +{ + return false; +} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS -- cgit v1.2.3 From 047dc6331de58da51818582c0db0dbfcb837e614 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:35 +0200 Subject: genirq: Remove pointless arg from show_irq_affinity The third argument of the internal helper function is unused. Remove it. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.004958600@linutronix.de --- kernel/irq/proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d35bb8d4c317..eff7c0c8f9b9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,7 +37,7 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int show_irq_affinity(int type, struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask = desc->irq_common_data.affinity; @@ -80,12 +80,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m, v); + return show_irq_affinity(0, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m, v); + return show_irq_affinity(1, m); } -- cgit v1.2.3 From 4ab764c336123157690ee0000a1dcf81851c58d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:36 +0200 Subject: genirq: Remove pointless gfp argument All callers hand in GPF_KERNEL. No point to have an extra argument for that. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.082544752@linutronix.de --- kernel/irq/irqdesc.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index feade536b6d1..48d4f0365e52 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -54,14 +54,14 @@ static void __init init_irq_default_affinity(void) #endif #ifdef CONFIG_SMP -static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, - gfp, node)) + GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -86,7 +86,7 @@ static void desc_smp_init(struct irq_desc *desc, int node, #else static inline int -alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif @@ -344,9 +344,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, struct module *owner) { struct irq_desc *desc; - gfp_t gfp = GFP_KERNEL; - desc = kzalloc_node(sizeof(*desc), gfp, node); + desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; /* allocate based on nr_cpu_ids */ @@ -354,7 +353,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, if (!desc->kstat_irqs) goto err_desc; - if (alloc_masks(desc, gfp, node)) + if (alloc_masks(desc, node)) goto err_kstat; raw_spin_lock_init(&desc->lock); @@ -525,7 +524,7 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].kstat_irqs = alloc_percpu(unsigned int); - alloc_masks(&desc[i], GFP_KERNEL, node); + alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); desc_set_defaults(i, &desc[i], node, NULL, NULL); -- cgit v1.2.3 From c1a80386965e9fa3c2f8d1d57966216fe02c9124 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:37 +0200 Subject: genirq/proc: Replace ever repeating type cast The proc file setup repeats the same ugly type cast for the irq number over and over. Do it once and hand in the local void pointer. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.160866358@linutronix.de --- kernel/irq/proc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index eff7c0c8f9b9..cbc4c5e377ec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -326,6 +326,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { static DEFINE_MUTEX(register_lock); + void __maybe_unused *irqp = (void *)(unsigned long) irq; char name [MAX_NAMELEN]; if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) @@ -351,20 +352,19 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ proc_create_data("smp_affinity", 0644, desc->dir, - &irq_affinity_proc_fops, (void *)(long)irq); + &irq_affinity_proc_fops, irqp); /* create /proc/irq//affinity_hint */ proc_create_data("affinity_hint", 0444, desc->dir, - &irq_affinity_hint_proc_fops, (void *)(long)irq); + &irq_affinity_hint_proc_fops, irqp); /* create /proc/irq//smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, - &irq_affinity_list_proc_fops, (void *)(long)irq); + &irq_affinity_list_proc_fops, irqp); proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, (void *)(long)irq); + &irq_node_proc_fops, irqp); #endif - proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); -- cgit v1.2.3 From 0d3f54257dc300f2db480d6a46b34bdb87f18c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:38 +0200 Subject: genirq: Introduce effective affinity mask There is currently no way to evaluate the effective affinity mask of a given interrupt. Many irq chips allow only a single target CPU or a subset of CPUs in the affinity mask. Updating the mask at the time of setting the affinity to the subset would be counterproductive because information for cpu hotplug about assigned interrupt affinities gets lost. On CPU hotplug it's also pointless to force migrate an interrupt, which is not targeted at the CPU effectively. But currently the information is not available. Provide a seperate mask to be updated by the irq_chip->irq_set_affinity() implementations. Implement the read only proc files so the user can see the effective mask as well w/o trying to deduce it from /proc/interrupts. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.247834245@linutronix.de --- kernel/irq/Kconfig | 4 +++ kernel/irq/debugfs.c | 4 +++ kernel/irq/irqdesc.c | 14 ++++++++ kernel/irq/proc.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 105 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8d9498e51585..fcbb1d6d51cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Supports effective affinity mask +config GENERIC_IRQ_EFFECTIVE_AFF_MASK + bool + # Facility to allocate a hardware interrupt. This is legacy support # and should not be used in new code. Use irq domains instead. config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 50ee2f6593e8..edbef252d0c4 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -36,6 +36,10 @@ static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) msk = irq_data_get_affinity_mask(data); seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + msk = irq_data_get_effective_affinity_mask(data); + seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk)); +#endif #ifdef CONFIG_GENERIC_PENDING_IRQ msk = desc->pending_mask; seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk)); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 48d4f0365e52..35a95fadcfda 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -60,8 +60,19 @@ static int alloc_masks(struct irq_desc *desc, int node) GFP_KERNEL, node)) return -ENOMEM; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, + GFP_KERNEL, node)) { + free_cpumask_var(desc->irq_common_data.affinity); + return -ENOMEM; + } +#endif + #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -324,6 +335,9 @@ static void free_masks(struct irq_desc *desc) free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index cbc4c5e377ec..7f9642a1e267 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP +enum { + AFFINITY, + AFFINITY_LIST, + EFFECTIVE, + EFFECTIVE_LIST, +}; + static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_common_data.affinity; + const struct cpumask *mask; + switch (type) { + case AFFINITY: + case AFFINITY_LIST: + mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; + if (irqd_is_setaffinity_pending(&desc->irq_data)) + mask = desc->pending_mask; #endif - if (type) + break; + case EFFECTIVE: + case EFFECTIVE_LIST: +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = desc->irq_common_data.effective_affinity; + break; +#else + return -EINVAL; +#endif + }; + + switch (type) { + case AFFINITY_LIST: + case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); - else + break; + case AFFINITY: + case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); + break; + } return 0; } @@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m); + return show_irq_affinity(AFFINITY, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m); + return show_irq_affinity(AFFINITY_LIST, m); } @@ -185,6 +213,44 @@ static const struct file_operations irq_affinity_list_proc_fops = { .write = irq_affinity_list_proc_write, }; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static int irq_effective_aff_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE, m); +} + +static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE_LIST, m); +} + +static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); +} + +static int irq_effective_aff_list_proc_open(struct inode *inode, + struct file *file) +{ + return single_open(file, irq_effective_aff_list_proc_show, + PDE_DATA(inode)); +} + +static const struct file_operations irq_effective_aff_proc_fops = { + .open = irq_effective_aff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations irq_effective_aff_list_proc_fops = { + .open = irq_effective_aff_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); @@ -364,6 +430,12 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, irqp); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + proc_create_data("effective_affinity", 0444, desc->dir, + &irq_effective_aff_proc_fops, irqp); + proc_create_data("effective_affinity_list", 0444, desc->dir, + &irq_effective_aff_list_proc_fops, irqp); +# endif #endif proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); @@ -383,6 +455,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + remove_proc_entry("effective_affinity", desc->dir); + remove_proc_entry("effective_affinity_list", desc->dir); +# endif #endif remove_proc_entry("spurious", desc->dir); -- cgit v1.2.3 From 415fcf1a2293046e0c1f4ab8558a87bad66652b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:39 +0200 Subject: genirq/cpuhotplug: Use effective affinity mask If the architecture supports the effective affinity mask, migrating interrupts away which are not targeted by the effective mask is pointless. They can stay in the user or system supplied affinity mask, but won't be targetted at any given point as the affinity setter functions need to validate against the online cpu mask anyway. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.328488490@linutronix.de --- kernel/irq/cpuhotplug.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index e09cb91a7c8b..0b093db3336b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -14,6 +14,14 @@ #include "internals.h" +/* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */ +static inline bool irq_needs_fixup(struct irq_data *d) +{ + const struct cpumask *m = irq_data_get_effective_affinity_mask(d); + + return cpumask_test_cpu(smp_processor_id(), m); +} + static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -42,9 +50,7 @@ static bool migrate_one_irq(struct irq_desc *desc) * Note: Do not check desc->action as this might be a chained * interrupt. */ - affinity = irq_data_get_affinity_mask(d); - if (irqd_is_per_cpu(d) || !irqd_is_started(d) || - !cpumask_test_cpu(smp_processor_id(), affinity)) { + if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) { /* * If an irq move is pending, abort it if the dying CPU is * the sole target. @@ -69,6 +75,8 @@ static bool migrate_one_irq(struct irq_desc *desc) */ if (irq_fixup_move_pending(desc, true)) affinity = irq_desc_get_pending_mask(desc); + else + affinity = irq_data_get_affinity_mask(d); /* Mask the chip for interrupts which cannot move in process context */ if (maskchip && chip->irq_mask) -- cgit v1.2.3 From 54fdf6a0875ca380647ac1cc9b5b8f2dbbbfa131 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:47 +0200 Subject: genirq: Introduce IRQD_MANAGED_SHUTDOWN Affinity managed interrupts should keep their assigned affinity accross CPU hotplug. To avoid magic hackery in device drivers, the core code shall manage them transparently. This will set these interrupts into a managed shutdown state when the last CPU of the assigned affinity mask goes offline. The interrupt will be restarted when one of the CPUs in the assigned affinity mask comes back online. Introduce the necessary state flag and the accessor functions. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.954523476@linutronix.de --- kernel/irq/internals.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 040806f1124c..ca4666b4cd39 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -193,6 +193,16 @@ static inline void irqd_clr_move_pending(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } +static inline void irqd_set_managed_shutdown(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN; +} + +static inline void irqd_clr_managed_shutdown(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN; +} + static inline void irqd_clear(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) &= ~mask; -- cgit v1.2.3 From 708d174b6c32bffc5d73793bc7a267bcafeb6558 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:48 +0200 Subject: genirq: Split out irq_startup() code Split out the inner workings of irq_startup() so it can be reused to handle managed interrupts gracefully. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.033235144@linutronix.de --- kernel/irq/chip.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e290d73b88e2..1163089aa245 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -195,6 +195,23 @@ static void irq_state_set_started(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } +static int __irq_startup(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + int ret = 0; + + irq_domain_activate_irq(d); + if (d->chip->irq_startup) { + ret = d->chip->irq_startup(d); + irq_state_clr_disabled(desc); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + irq_state_set_started(desc); + return ret; +} + int irq_startup(struct irq_desc *desc, bool resend) { int ret = 0; @@ -204,19 +221,9 @@ int irq_startup(struct irq_desc *desc, bool resend) if (irqd_is_started(&desc->irq_data)) { irq_enable(desc); } else { - irq_domain_activate_irq(&desc->irq_data); - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_disabled(desc); - irq_state_clr_masked(desc); - } else { - irq_enable(desc); - } - irq_state_set_started(desc); - /* Set default affinity mask once everything is setup */ + ret = __irq_startup(desc); irq_setup_affinity(desc); } - if (resend) check_irq_resend(desc); -- cgit v1.2.3 From 4cde9c6b826834b861a2b58653ab33150f562064 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:49 +0200 Subject: genirq: Add force argument to irq_startup() In order to handle managed interrupts gracefully on irq_startup() so they won't lose their assigned affinity, it's necessary to allow startups which keep the interrupts in managed shutdown state, if none of the assigend CPUs is online. This allows drivers to request interrupts w/o the CPUs being online, which avoid online/offline churn in drivers. Add a force argument which can override that decision and let only request_irq() and enable_irq() allow the managed shutdown handling. enable_irq() is required, because the interrupt might be requested with IRQF_NOAUTOEN and enable_irq() invokes irq_startup() which would then wreckage the assignment again. All other callers force startup and potentially break the assigned affinity. No functional change as this only adds the function argument. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.112094565@linutronix.de --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 9 ++++++++- kernel/irq/manage.c | 4 ++-- 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 0119b9d467ae..d30a0dd5cc02 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc, false); + irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, false)) + if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1163089aa245..b7599e952d3b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -212,7 +212,7 @@ static int __irq_startup(struct irq_desc *desc) return ret; } -int irq_startup(struct irq_desc *desc, bool resend) +int irq_startup(struct irq_desc *desc, bool resend, bool force) { int ret = 0; @@ -892,7 +892,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); } } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca4666b4cd39..5fd105e252c3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -66,7 +66,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); extern void __disable_irq(struct irq_desc *desc); extern void __enable_irq(struct irq_desc *desc); -extern int irq_startup(struct irq_desc *desc, bool resend); +#define IRQ_RESEND true +#define IRQ_NORESEND false + +#define IRQ_START_FORCE true +#define IRQ_START_COND false + +extern int irq_startup(struct irq_desc *desc, bool resend, bool force); + extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7dcf19397c39..3577c091ac7b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -509,7 +509,7 @@ void __enable_irq(struct irq_desc *desc) * time. If it was already started up, then irq_startup() * will invoke irq_enable() under the hood. */ - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); break; } default: @@ -1306,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (irq_settings_can_autoenable(desc)) { - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling -- cgit v1.2.3 From 761ea388e8c4e3ac883a94e16bcc8c51fa419d4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:50 +0200 Subject: genirq: Handle managed irqs gracefully in irq_startup() Affinity managed interrupts should keep their assigned affinity accross CPU hotplug. To avoid magic hackery in device drivers, the core code shall manage them transparently and set these interrupts into a managed shutdown state when the last CPU of the assigned affinity mask goes offline. The interrupt will be restarted when one of the CPUs in the assigned affinity mask comes back online. Add the necessary logic to irq_startup(). If an interrupt is requested and started up, the code checks whether it is affinity managed and if so, it checks whether a CPU in the interrupts affinity mask is online. If not, it puts the interrupt into managed shutdown state. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.189851170@linutronix.de --- kernel/irq/chip.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7599e952d3b..fc89eeb8a6b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -195,6 +195,52 @@ static void irq_state_set_started(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } +enum { + IRQ_STARTUP_NORMAL, + IRQ_STARTUP_MANAGED, + IRQ_STARTUP_ABORT, +}; + +#ifdef CONFIG_SMP +static int +__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + if (!irqd_affinity_is_managed(d)) + return IRQ_STARTUP_NORMAL; + + irqd_clr_managed_shutdown(d); + + if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + /* + * Catch code which fiddles with enable_irq() on a managed + * and potentially shutdown IRQ. Chained interrupt + * installment or irq auto probing should not happen on + * managed irqs either. Emit a warning, break the affinity + * and start it up as a normal interrupt. + */ + if (WARN_ON_ONCE(force)) + return IRQ_STARTUP_NORMAL; + /* + * The interrupt was requested, but there is no online CPU + * in it's affinity mask. Put it into managed shutdown + * state and let the cpu hotplug mechanism start it up once + * a CPU in the mask becomes available. + */ + irqd_set_managed_shutdown(d); + return IRQ_STARTUP_ABORT; + } + return IRQ_STARTUP_MANAGED; +} +#else +static int +__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +{ + return IRQ_STARTUP_NORMAL; +} +#endif + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -214,15 +260,27 @@ static int __irq_startup(struct irq_desc *desc) int irq_startup(struct irq_desc *desc, bool resend, bool force) { + struct irq_data *d = irq_desc_get_irq_data(desc); + struct cpumask *aff = irq_data_get_affinity_mask(d); int ret = 0; desc->depth = 0; - if (irqd_is_started(&desc->irq_data)) { + if (irqd_is_started(d)) { irq_enable(desc); } else { - ret = __irq_startup(desc); - irq_setup_affinity(desc); + switch (__irq_startup_managed(desc, aff, force)) { + case IRQ_STARTUP_NORMAL: + ret = __irq_startup(desc); + irq_setup_affinity(desc); + break; + case IRQ_STARTUP_MANAGED: + ret = __irq_startup(desc); + irq_set_affinity_locked(d, aff, false); + break; + case IRQ_STARTUP_ABORT: + return 0; + } } if (resend) check_irq_resend(desc); -- cgit v1.2.3 From c5cb83bb337c25caae995d992d1cdf9b317f83de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:51 +0200 Subject: genirq/cpuhotplug: Handle managed IRQs on CPU hotplug If a CPU goes offline, interrupts affine to the CPU are moved away. If the outgoing CPU is the last CPU in the affinity mask the migration code breaks the affinity and sets it it all online cpus. This is a problem for affinity managed interrupts as CPU hotplug is often used for power management purposes. If the affinity is broken, the interrupt is not longer affine to the CPUs to which it was allocated. The affinity spreading allows to lay out multi queue devices in a way that they are assigned to a single CPU or a group of CPUs. If the last CPU goes offline, then the queue is not longer used, so the interrupt can be shutdown gracefully and parked until one of the assigned CPUs comes online again. Add a graceful shutdown mechanism into the irq affinity breaking code path, mark the irq as MANAGED_SHUTDOWN and leave the affinity mask unmodified. In the online path, scan the active interrupts for managed interrupts and if the interrupt is functional and the newly online CPU is part of the affinity mask, restart the interrupt if it is marked MANAGED_SHUTDOWN or if the interrupts is started up, try to add the CPU back to the effective affinity mask. Originally-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170619235447.273417334@linutronix.de --- kernel/cpu.c | 5 +++++ kernel/irq/cpuhotplug.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cb5103413bd8..b86b32ebb3b2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1252,6 +1252,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = smpboot_unpark_threads, .teardown.single = NULL, }, + [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { + .name = "irq/affinity:online", + .startup.single = irq_affinity_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_PERF_ONLINE] = { .name = "perf:online", .startup.single = perf_event_init_cpu, diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 0b093db3336b..b7964e72ded7 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -83,6 +83,15 @@ static bool migrate_one_irq(struct irq_desc *desc) chip->irq_mask(d); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + /* + * If the interrupt is managed, then shut it down and leave + * the affinity untouched. + */ + if (irqd_affinity_is_managed(d)) { + irqd_set_managed_shutdown(d); + irq_shutdown(desc); + return false; + } affinity = cpu_online_mask; brokeaff = true; } @@ -129,3 +138,39 @@ void irq_migrate_all_off_this_cpu(void) } } } + +static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + const struct cpumask *affinity = irq_data_get_affinity_mask(data); + + if (!irqd_affinity_is_managed(data) || !desc->action || + !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) + return; + + if (irqd_is_managed_and_shutdown(data)) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + else + irq_set_affinity_locked(data, affinity, false); +} + +/** + * irq_affinity_online_cpu - Restore affinity for managed interrupts + * @cpu: Upcoming CPU for which interrupts should be restored + */ +int irq_affinity_online_cpu(unsigned int cpu) +{ + struct irq_desc *desc; + unsigned int irq; + + irq_lock_sparse(); + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + raw_spin_lock_irq(&desc->lock); + irq_restore_affinity_of_irq(desc, cpu); + raw_spin_unlock_irq(&desc->lock); + } + irq_unlock_sparse(); + + return 0; +} -- cgit v1.2.3 From d52dd44175bd27ad9d8e34a994fb80877c1f6d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:52 +0200 Subject: genirq: Introduce IRQD_SINGLE_TARGET flag Many interrupt chips allow only a single CPU as interrupt target. The core code has no knowledge about that. That's unfortunate as it could avoid trying to readd a newly online CPU to the effective affinity mask. Add the status flag and the necessary accessors. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.352343969@linutronix.de --- kernel/irq/debugfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index edbef252d0c4..dbd6e78db213 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -105,6 +105,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_PER_CPU), BIT_MASK_DESCR(IRQD_NO_BALANCING), + BIT_MASK_DESCR(IRQD_SINGLE_TARGET), BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), -- cgit v1.2.3 From 8f31a9845db348f5781df47ce04c79e4cfe90016 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:53 +0200 Subject: genirq/cpuhotplug: Avoid irq affinity setting for single targets Avoid trying to add a newly online CPU to the effective affinity mask of an started up interrupt. That interrupt will either stay on the already online CPU or move around for no value. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.431321047@linutronix.de --- kernel/irq/cpuhotplug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index b7964e72ded7..aee8f7ec40af 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -148,9 +148,17 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - if (irqd_is_managed_and_shutdown(data)) + if (irqd_is_managed_and_shutdown(data)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - else + return; + } + + /* + * If the interrupt can only be directed to a single target + * CPU then it is already assigned to a CPU in the affinity + * mask. No point in trying to move it around. + */ + if (!irqd_is_single_target(data)) irq_set_affinity_locked(data, affinity, false); } -- cgit v1.2.3 From 9a0ef98e186d86fb3c1ff3ec267a76f067005f74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2017 01:37:55 +0200 Subject: genirq/affinity: Assign vectors to all present CPUs Currently the irq vector spread algorithm is restricted to online CPUs, which ties the IRQ mapping to the currently online devices and doesn't deal nicely with the fact that CPUs could come and go rapidly due to e.g. power management. Instead assign vectors to all present CPUs to avoid this churn. Build a map of all possible CPUs for a given node, as the architectures only provide a map of all onlines CPUs. Do this dynamically on each call for the vector assingments, which is a bit suboptimal and could be optimized in the future by provinding a mapping from the arch code. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Sagi Grimberg Cc: Marc Zyngier Cc: Michael Ellerman Cc: linux-nvme@lists.infradead.org Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170603140403.27379-5-hch@lst.de --- kernel/irq/affinity.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..d2747f9c5707 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -1,4 +1,7 @@ - +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ #include #include #include @@ -35,13 +38,54 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +static cpumask_var_t *alloc_node_to_present_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node = 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >= 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_present_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node = 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_present_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_present_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for_each_online_node(n) { - if (cpumask_intersects(mask, cpumask_of_node(n))) { + for_each_node(n) { + if (cpumask_intersects(mask, node_to_present_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -64,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk; + cpumask_var_t nmsk, *node_to_present_cpumask; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -73,13 +117,19 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; + node_to_present_cpumask = alloc_node_to_present_cpumask(); + if (!node_to_present_cpumask) + goto out; + /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); /* Stabilize the cpumasks */ get_online_cpus(); - nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); + build_node_to_present_cpumask(node_to_present_cpumask); + nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -87,7 +137,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) */ if (affv <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, cpumask_of_node(n)); + cpumask_copy(masks + curvec, + node_to_present_cpumask[n]); if (++curvec == last_affv) break; } @@ -101,7 +152,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); + cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -133,6 +184,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + free_node_to_present_cpumask(node_to_present_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -147,12 +199,10 @@ int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; - int cpus; + int ret; - /* Stabilize the cpumasks */ get_online_cpus(); - cpus = cpumask_weight(cpu_online_mask); + ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; put_online_cpus(); - - return min(cpus, vecs) + resv; + return ret; } -- cgit v1.2.3 From 61d0a000b7746665c7cfcff766532f6f2a922a61 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 22 Jun 2017 11:34:57 +0100 Subject: genirq/irqdomain: Add irq_domain_update_bus_token helper We can have irq domains that are identified by the same fwnode (because they are serviced by the same HW), and yet have different functionnality (because they serve different busses, for example). This is what we use the bus_token field. Since we don't use this field when generating the domain name, all the aliasing domains will get the same name, and the debugfs file creation fails. Also, bus_token is updated by individual drivers, and the core code is unaware of that update. In order to sort this mess, let's introduce a helper that takes care of updating bus_token, and regenerate the debugfs file. A separate patch will update all the individual users. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 75e1f0851c33..f6adeaeb4c16 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -245,6 +245,37 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token) +{ + char *name; + + if (domain->bus_token == bus_token) + return; + + mutex_lock(&irq_domain_mutex); + + domain->bus_token = bus_token; + + name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); + if (!name) { + mutex_unlock(&irq_domain_mutex); + return; + } + + debugfs_remove_domain_dir(domain); + + if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) + kfree(domain->name); + else + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + + domain->name = name; + debugfs_add_domain_dir(domain); + + mutex_unlock(&irq_domain_mutex); +} + /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. -- cgit v1.2.3 From 6a6544e520abecd484ab8b67fb50d1fc003f3275 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Jun 2017 22:17:44 +0100 Subject: genirq/irqdomain: Remove auto-recursive hierarchy support It did seem like a good idea at the time, but it never really caught on, and auto-recursive domains remain unused 3 years after having been introduced. Oh well, time for a late spring cleanup. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 55 ++++++++++++-------------------------------------- kernel/irq/msi.c | 2 +- 2 files changed, 14 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f6adeaeb4c16..14fe862aa2e3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1342,43 +1342,18 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static bool irq_domain_is_auto_recursive(struct irq_domain *domain) -{ - return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; -} - -static void irq_domain_free_irqs_recursive(struct irq_domain *domain, +static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { domain->ops->free(domain, irq_base, nr_irqs); - if (irq_domain_is_auto_recursive(domain)) { - BUG_ON(!domain->parent); - irq_domain_free_irqs_recursive(domain->parent, irq_base, - nr_irqs); - } } -int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, +int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { - int ret = 0; - struct irq_domain *parent = domain->parent; - bool recursive = irq_domain_is_auto_recursive(domain); - - BUG_ON(recursive && !parent); - if (recursive) - ret = irq_domain_alloc_irqs_recursive(parent, irq_base, - nr_irqs, arg); - if (ret < 0) - return ret; - - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); - if (ret < 0 && recursive) - irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); - - return ret; + return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } /** @@ -1439,7 +1414,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, } mutex_lock(&irq_domain_mutex); - ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); if (ret < 0) { mutex_unlock(&irq_domain_mutex); goto out_free_irq_data; @@ -1474,7 +1449,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) mutex_lock(&irq_domain_mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); - irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); + irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); mutex_unlock(&irq_domain_mutex); irq_domain_free_irq_data(virq, nr_irqs); @@ -1494,15 +1469,11 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { - /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ - if (irq_domain_is_auto_recursive(domain)) - return 0; + if (!domain->parent) + return -ENOSYS; - domain = domain->parent; - if (domain) - return irq_domain_alloc_irqs_recursive(domain, irq_base, - nr_irqs, arg); - return -ENOSYS; + return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, + nr_irqs, arg); } EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); @@ -1517,10 +1488,10 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { - /* irq_domain_free_irqs_recursive() will call parent's free */ - if (!irq_domain_is_auto_recursive(domain) && domain->parent) - irq_domain_free_irqs_recursive(domain->parent, irq_base, - nr_irqs); + if (!domain->parent) + return; + + irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9e3f1857c6bd..48eadf416c24 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -315,7 +315,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(arg, desc); /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (ret) break; -- cgit v1.2.3 From feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jun 2017 17:04:55 -0400 Subject: tracing: Show address when function names are not found Currently, when a function is not found in kallsyms, instead of simply showing the function address, it shows nothing at all: # echo ':mod:kvm_intel' > /sys/kernel/tracing/set_ftrace_filter # echo function > /sys/kernel/tracing/set_ftrace_filter # qemu -enable-kvm /home/my-qemu-image # rmmod kvm_intel # cat /sys/kernel/tracing/trace qemu-system-x86-2408 [001] d..2 135.013238: <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: <-__do_cpuid_ent When it should show: qemu-system-x86-2408 [001] d..2 135.013238: 0xffffffffa02a39f0 <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: 0xffffffffa02a2ba0 <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: 0xffffffffa029e4e0 <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: 0xffffffffa02a1380 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e160 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e180 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e520 <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: 0xffffffffa02a13b0 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: 0xffffffffa02a1380 <-__do_cpuid_ent instead. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 08f9bab8089e..01ff99969ca7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; sprint_symbol(str, address); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } #ifndef CONFIG_64BIT -- cgit v1.2.3 From e1d4eeec5aaa28d25f249c0195b0e1d9b9feb7bd Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 14 Jun 2017 13:19:23 -0400 Subject: sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled Make CONFIG_CPUSETS=y depend on SMP as this feature makes no sense on UP. This allows for configuring out cpuset_cpumask_can_shrink() and task_can_attach() entirely, which shrinks the kernel a bit. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170614171926.8345-2-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 62166da1c359..7faf4b322b63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5451,6 +5451,8 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +#ifdef CONFIG_SMP + int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -5494,7 +5496,6 @@ int task_can_attach(struct task_struct *p, goto out; } -#ifdef CONFIG_SMP if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, cs_cpus_allowed)) { unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, @@ -5524,13 +5525,11 @@ int task_can_attach(struct task_struct *p, rcu_read_unlock_sched(); } -#endif + out: return ret; } -#ifdef CONFIG_SMP - bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From 06a76fe08d4daaeea01ca0f175ad29f40c781ece Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 21 Jun 2017 14:22:01 -0400 Subject: sched/deadline: Move DL related code from sched/core.c to sched/deadline.c This helps making sched/core.c smaller and hopefully easier to understand and maintain. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170621182203.30626-2-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 343 +---------------------------------------------- kernel/sched/deadline.c | 344 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 17 ++- 3 files changed, 364 insertions(+), 340 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7faf4b322b63..54e1b0700af3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2139,25 +2139,6 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } -/* - * This function clears the sched_dl_entity static params. - */ -void __dl_clear_params(struct task_struct *p) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - dl_se->dl_density = 0; - - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - dl_se->dl_non_contending = 0; -} - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -2438,101 +2419,6 @@ unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << BW_SHIFT, period); } -#ifdef CONFIG_SMP -inline struct dl_bw *dl_bw_of(int i) -{ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - return &cpu_rq(i)->rd->dl_bw; -} - -inline int dl_bw_cpus(int i) -{ - struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; -} -#else -inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -inline int dl_bw_cpus(int i) -{ - return 1; -} -#endif - -/* - * We must be sure that accepting a new task (or allowing changing the - * parameters of an existing one) is consistent with the bandwidth - * constraints. If yes, this function also accordingly updates the currently - * allocated bandwidth to reflect the new situation. - * - * This function is called while holding p's rq->lock. - */ -static int dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr) -{ - - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period ?: attr->sched_deadline; - u64 runtime = attr->sched_runtime; - u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; - - /* !deadline task may carry old deadline bandwidth */ - if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) - return 0; - - /* - * Either if a task, enters, leave, or stays -deadline but changes - * its parameters, we may need to update accordingly the total - * allocated bandwidth of the container. - */ - raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); - if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { - if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); - __dl_add(dl_b, new_bw, cpus); - err = 0; - } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - /* - * XXX this is slightly incorrect: when the task - * utilization decreases, we should delay the total - * utilization change until the task's 0-lag point. - * But this would require to set the task's "inactive - * timer" when the task is not inactive. - */ - __dl_clear(dl_b, p->dl.dl_bw, cpus); - __dl_add(dl_b, new_bw, cpus); - dl_change_utilization(p, new_bw); - err = 0; - } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - /* - * Do not decrease the total deadline utilization here, - * switched_from_dl() will take care to do it at the correct - * (0-lag) time. - */ - err = 0; - } - raw_spin_unlock(&dl_b->lock); - - return err; -} - -extern void init_dl_bw(struct dl_bw *dl_b); - /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -4014,27 +3900,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* - * This function initializes the sched_dl_entity of a newly becoming - * SCHED_DEADLINE task. - * - * Only the static values are considered here, the actual runtime and the - * absolute deadline will be properly calculated when the task is enqueued - * for the first time with its new policy. - */ -static void -__setparam_dl(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = attr->sched_runtime; - dl_se->dl_deadline = attr->sched_deadline; - dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; - dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); -} - /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -4088,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &fair_sched_class; } -static void -__getparam_dl(struct task_struct *p, struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; - attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; -} - -/* - * This function validates the new parameters of a -deadline task. - * We ask for the deadline not being zero, and greater or equal - * than the runtime, as well as the period of being zero or - * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution of 1us (we - * check sched_runtime only since it is always the smaller one) and - * below 2^63 ns (we have to check both sched_deadline and - * sched_period, as the latter can be zero). - */ -static bool -__checkparam_dl(const struct sched_attr *attr) -{ - /* deadline != 0 */ - if (attr->sched_deadline == 0) - return false; - - /* - * Since we truncate DL_SCALE bits, make sure we're at least - * that big. - */ - if (attr->sched_runtime < (1ULL << DL_SCALE)) - return false; - - /* - * Since we use the MSB for wrap-around and sign issues, make - * sure it's not set (mind that period can be equal to zero). - */ - if (attr->sched_deadline & (1ULL << 63) || - attr->sched_period & (1ULL << 63)) - return false; - - /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || - attr->sched_deadline < attr->sched_runtime) - return false; - - return true; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -4157,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - if (dl_se->dl_runtime != attr->sched_runtime || - dl_se->dl_deadline != attr->sched_deadline || - dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) - return true; - - return false; -} - static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) @@ -4350,7 +4149,7 @@ change: * of a SCHED_DEADLINE task) we need to check if enough bandwidth * is available. */ - if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -5456,23 +5255,12 @@ void init_idle(struct task_struct *idle, int cpu) int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; - struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; if (!cpumask_weight(cur)) return ret; - rcu_read_lock_sched(); - cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - - raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) - ret = 0; - raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); - rcu_read_unlock_sched(); + ret = dl_cpuset_cpumask_can_shrink(cur, trial); return ret; } @@ -5497,34 +5285,8 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); - struct dl_bw *dl_b; - bool overflow; - int cpus; - unsigned long flags; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) - ret = -EBUSY; - else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - } + cs_cpus_allowed)) + ret = dl_task_can_attach(p, cs_cpus_allowed); out: return ret; @@ -5792,23 +5554,8 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { - unsigned long flags; - struct dl_bw *dl_b; - bool overflow; - int cpus; - if (!cpuhp_tasks_frozen) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) + if (dl_cpu_busy(cpu)) return -EBUSY; cpuset_update_active_cpus(); } else { @@ -6711,84 +6458,6 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_validate(void) -{ - u64 runtime = global_rt_runtime(); - u64 period = global_rt_period(); - u64 new_bw = to_ratio(period, runtime); - struct dl_bw *dl_b; - int cpu, ret = 0; - unsigned long flags; - - /* - * Here we want to check the bandwidth not being set to some - * value smaller than the currently allocated bandwidth in - * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) - ret = -EBUSY; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (ret) - break; - } - - return ret; -} - -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) -{ - if (global_rt_runtime() == RUNTIME_INF) { - dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; - } else { - dl_rq->bw_ratio = to_ratio(global_rt_runtime(), - global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); - } -} - -static void sched_dl_do_global(void) -{ - u64 new_bw = -1; - struct dl_bw *dl_b; - int cpu; - unsigned long flags; - - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - - if (global_rt_runtime() != RUNTIME_INF) - new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - - /* - * FIXME: As above... - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - dl_b->bw = new_bw; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); - } -} - static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e12f85975857..a84299f44b5d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ #include "sched.h" #include +#include struct dl_bandwidth def_dl_bandwidth; @@ -43,6 +44,38 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_SMP +static inline struct dl_bw *dl_bw_of(int i) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +static inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + static inline void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { @@ -2318,6 +2351,317 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; +int sched_dl_global_validate(void) +{ + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; + int cpu, ret = 0; + unsigned long flags; + + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (ret) + break; + } + + return ret; +} + +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); + } +} + +void sched_dl_do_global(void) +{ + u64 new_bw = -1; + struct dl_bw *dl_b; + int cpu; + unsigned long flags; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + dl_b->bw = new_bw; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + } +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + dl_change_utilization(p, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ +void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); +} + +void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +bool __checkparam_dl(const struct sched_attr *attr) +{ + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; +} + +/* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; +} + +bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + +#ifdef CONFIG_SMP +int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) +{ + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus, ret; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, cpus); + ret = 0; + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +bool dl_cpu_busy(unsigned int cpu) +{ + unsigned long flags; + struct dl_bw *dl_b; + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return overflow; +} +#endif + #ifdef CONFIG_SCHED_DEBUG extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0329d10bdb8..d4eb3f67529d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -218,9 +218,6 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -extern struct dl_bw *dl_bw_of(int i); -extern int dl_bw_cpus(int i); - struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; @@ -251,6 +248,20 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern void __dl_clear_params(struct task_struct *p); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED -- cgit v1.2.3 From 8887cd99038bf242fb47f2d07fa0cf9371efa643 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 21 Jun 2017 14:22:02 -0400 Subject: sched/rt: Move RT related code from sched/core.c to sched/rt.c This helps making sched/core.c smaller and hopefully easier to understand and maintain. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170621182203.30626-3-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 315 --------------------------------------------------- kernel/sched/rt.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 5 + 3 files changed, 315 insertions(+), 315 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 54e1b0700af3..5186797908dc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6224,321 +6224,6 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, tsk, &rf); } -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - /* - * Autogroups do not have RT tasks; see autogroup_create(). - */ - if (task_group_is_autogroup(tg)) - return 0; - - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } - - return 0; -} - -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 rt_runtime; -}; - -static int tg_rt_schedulable(struct task_group *tg, void *data) -{ - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; - - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; - - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; - - total = to_ratio(period, runtime); - - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; - - /* - * The sum of our children's runtime should not exceed our own. - */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; - - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - sum += to_ratio(period, runtime); - } - - if (sum > total) - return -EINVAL; - - return 0; -} - -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - int ret; - - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - rcu_read_lock(); - ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); - rcu_read_unlock(); - - return ret; -} - -static int tg_set_rt_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - /* - * Disallowing the root group RT runtime is BAD, it would disallow the - * kernel creating (and or operating) RT threads. - */ - if (tg == &root_task_group && rt_runtime == 0) - return -EINVAL; - - /* No period doesn't make any sense. */ - if (rt_period == 0) - return -EINVAL; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) - goto unlock; - - raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); -unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return rt_runtime_us; -} - -static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int sched_rt_global_constraints(void) -{ - int ret = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} - -static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static int sched_rt_global_validate(void) -{ - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) - return -EINVAL; - - return 0; -} - -static void sched_rt_do_global(void) -{ - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); -} - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - int ret; - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_validate(); - if (ret) - goto undo; - - ret = sched_dl_global_validate(); - if (ret) - goto undo; - - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); - sched_dl_do_global(); - } - if (0) { -undo: - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } - mutex_unlock(&mutex); - - return ret; -} - -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* - * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: - */ - if (!ret && write) { - sched_rr_timeslice = - sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : - msecs_to_jiffies(sysctl_sched_rr_timeslice); - } - mutex_unlock(&mutex); - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 581d5c7a5264..45caf937ef90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2449,6 +2449,316 @@ const struct sched_class rt_sched_class = { .update_curr = update_curr_rt, }; +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) + return 1; + } + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + int ret; + + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + + ret = sched_dl_global_validate(); + if (ret) + goto undo; + + ret = sched_rt_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } + mutex_unlock(&mutex); + + return ret; +} + +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the timeslice to default: + */ + if (!ret && write) { + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d4eb3f67529d..eeef1a3086d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -383,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, -- cgit v1.2.3 From 239946314e57711d7da546b67964d0b387a3ee42 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jun 2017 15:07:39 -0700 Subject: bpf: possibly avoid extra masking for narrower load in verifier Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") permits narrower load for certain ctx fields. The commit however will already generate a masking even if the prog-specific ctx conversion produces the result with narrower size. For example, for __sk_buff->protocol, the ctx conversion loads the data into register with 2-byte load. A narrower 2-byte load should not generate masking. For __sk_buff->vlan_present, the conversion function set the result as either 0 or 1, essentially a byte. The narrower 2-byte or 1-byte load should not generate masking. To avoid unnecessary masking, prog-specific *_is_valid_access now passes converted_op_size back to verifier, which indicates the valid data width after perceived future conversion. Based on this information, verifier is able to avoid unnecessary marking. Since we want more information back from prog-specific *_is_valid_access checking, all of them are packed into one data structure for more clarity. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++-------- kernel/trace/bpf_trace.c | 17 +++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44b97d958fb7..74ea96ea391b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -761,22 +761,34 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - int ctx_field_size = 0; + struct bpf_insn_access_aux info = { .reg_type = *reg_type }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { - /* a non zero ctx_field_size indicates: + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + /* a non zero info.ctx_field_size indicates: * . For this field, the prog type specific ctx conversion algorithm * only supports whole field access. * . This ctx access is a candiate for later verifier transformation * to load the whole field and then apply a mask to get correct result. + * a non zero info.converted_op_size indicates perceived actual converted + * value width in convert_ctx_access. */ - if (ctx_field_size) - env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + if ((info.ctx_field_size && !info.converted_op_size) || + (!info.ctx_field_size && info.converted_op_size)) { + verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", + env->prog->type, off, size); + return -EACCES; + } + + if (info.ctx_field_size) { + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; + } + *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3388,7 +3400,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; + int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3431,7 +3443,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) off = insn->off; size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - is_narrower_load = (type == BPF_READ && size < ctx_field_size); + converted_op_size = env->insn_aux_data[i + delta].converted_op_size; + is_narrower_load = type == BPF_READ && size < ctx_field_size; /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific @@ -3453,7 +3466,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load) { + if (is_narrower_load && size < converted_op_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9d3ec8253131..97c46b440cd6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,7 +581,7 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { int sample_period_off; @@ -595,12 +595,17 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - *ctx_field_size = 8; + int allowed; + #ifdef __LITTLE_ENDIAN - return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; + allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; #else - return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; + allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; #endif + if (!allowed) + return false; + info->ctx_field_size = 8; + info->converted_op_size = 8; } else { if (size != sizeof(long)) return false; -- cgit v1.2.3 From 739294fb03f590401bbd7faa6d31a507e3ffada5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:27 -0400 Subject: sched/numa: Override part of migrate_degrades_locality() when idle balancing Several tests in the NAS benchmark seem to run a lot slower with NUMA balancing enabled, than with NUMA balancing disabled. The slower run time corresponds with increased idle time. Overriding the final test of migrate_degrades_locality (but still doing the other NUMA tests first) seems to improve performance of those benchmarks. Reported-by: Jirka Hladky Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-2-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 694c258b8771..6e0c0524131e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (env->idle != CPU_NOT_IDLE) + return -1; + if (numa_group) { src_faults = group_faults(p, src_nid); dst_faults = group_faults(p, dst_nid); -- cgit v1.2.3 From 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:28 -0400 Subject: sched/fair: Simplify wake_affine() for the single socket case Then 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling() will do its thing regardless of the return value of wake_affine(). Just return true and don't look at all the other things. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-3-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e0c0524131e..fe1901686fa5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5419,6 +5419,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); + /* + * Common case: CPUs are in the same socket, and select_idle_sibling() + * will do its thing regardless of what we return: + */ + if (cpus_share_cache(prev_cpu, this_cpu)) + return true; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + if (cpu == prev_cpu) + goto pick_cpu; + + if (wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { + pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:29 -0400 Subject: sched/numa: Implement NUMA node level wake_affine() Since select_idle_sibling() can place a task anywhere on a socket, comparing loads between individual CPU cores makes no real sense for deciding whether to do an affine wakeup across sockets, either. Instead, compare the load between the sockets in a similar way the load balancer and the numa balancing code do. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe1901686fa5..79ac078caf5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* + * In low-load situations, where this_cpu's node is idle due to the + * sync cause above having dropped this_load.load to 0, move the task. + * Moving to an idle socket will not create a bad imbalance. + * + * Otherwise check if the nodes are near enough in load to allow this + * task to be woken on this_cpu's node. + */ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { - s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + int this_cpu = smp_processor_id(); + bool affine = false; /* * Common case: CPUs are in the same socket, and select_idle_sibling() * will do its thing regardless of what we return: */ if (cpus_share_cache(prev_cpu, this_cpu)) - return true; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.avg.load_avg; - - /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. - */ - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); - - if (this_load > 0) { - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - } - - balanced = this_eff_load <= prev_eff_load; + affine = true; + else + affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + if (affine) { + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + } - if (!balanced) - return 0; - - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - - return 1; + return affine; } static inline int task_util(struct task_struct *p); -- cgit v1.2.3 From 815abf5af45f04f759f12f3172afd15226fd7f71 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:30 -0400 Subject: sched/fair: Remove effective_load() The effective_load() function was only used by the NUMA balancing code, and not by the regular load balancing code. Now that the NUMA balancing code no longer uses it either, get rid of it. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-5-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 124 +--------------------------------------------------- 1 file changed, 1 insertion(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79ac078caf5d..6f4f155adf5f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share() (which is - * done) and effective_load() (which is not done because it is too costly). + * Updating tg's load_avg is necessary before update_cfs_share(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. - * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j (1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. - * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. - */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent) /* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); - - tg = cfs_rq->tg; - - /* - * W = @wg + \Sum rw_j - */ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; - - /* - * w = rw_i + @wl - */ - w += wl; - - /* - * wl = S * s'_i; see (2) - */ - if (W > 0 && w < W) - wl = (w * (long)scale_load_down(tg->shares)) / W; - else - wl = scale_load_down(tg->shares); - - /* - * Per the above, wl is the new se->load.weight value; since - * those are clipped to [MIN_SHARES, ...) do so now. See - * calc_cfs_shares(). - */ - if (wl < MIN_SHARES) - wl = MIN_SHARES; - - /* - * wl = dw_i = S * (s'_i - s_i); see (3) - */ - wl -= se->avg.load_avg; - - /* - * Recursively apply this logic to all parent groups to compute - * the final effective load change on the root group. Since - * only the @tg group gets extra weight, all parent groups can - * only redistribute existing shares. @wl is the shift in shares - * resulting from this level per the above. - */ - wg = 0; - } - - return wl; -} -#else - -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - return wl; -} - -#endif - static void record_wakee(struct task_struct *p) { /* -- cgit v1.2.3 From c2ce34c0a0e5187195ecade872be950d2611ba68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Jun 2017 11:05:59 +0200 Subject: genirq/debugfs: Remove pointless NULL pointer check debugfs_remove() has it's own NULL pointer check. Remove the conditional and make irq_remove_debugfs_entry() an inline helper Reported-by: kbuild test robot Signed-off-by: Thomas Gleixner --- kernel/irq/debugfs.c | 7 ------- kernel/irq/internals.h | 7 ++++++- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index dbd6e78db213..4d384edc0c64 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -3,7 +3,6 @@ * * This file is licensed under the GPL V2. */ -#include #include #include @@ -191,12 +190,6 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) &dfs_irq_ops); } -void irq_remove_debugfs_entry(struct irq_desc *desc) -{ - if (desc->debugfs_file) - debugfs_remove(desc->debugfs_file); -} - static int __init irq_debugfs_init(void) { struct dentry *root_dir; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5fd105e252c3..a573e0771baf 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -318,8 +318,13 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS +#include + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); -void irq_remove_debugfs_entry(struct irq_desc *desc); +static inline void irq_remove_debugfs_entry(struct irq_desc *desc) +{ + debugfs_remove(desc->debugfs_file); +} # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -- cgit v1.2.3 From b2d3d61adb7b73cfe5f82404f7a130a76fc64232 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 23 Jun 2017 16:11:07 +0200 Subject: genirq/timings: Add infrastructure to track the interrupt timings The interrupt framework gives a lot of information about each interrupt. It does not keep track of when those interrupts occur though, which is a prerequisite for estimating the next interrupt arrival for power management purposes. Add a mechanism to record the timestamp for each interrupt occurrences in a per-CPU circular buffer to help with the prediction of the next occurrence using a statistical model. Each CPU can store up to IRQ_TIMINGS_SIZE events , the current value of IRQ_TIMINGS_SIZE is 32. Each event is encoded into a single u64, where the high 48 bits are used for the timestamp and the low 16 bits are for the irq number. A static key is introduced so when the irq prediction is switched off at runtime, the overhead is near to zero. It results in most of the code in internals.h for inline reasons and a very few in the new file timings.c. The latter will contain more in the next patch which will provide the statistical model for the next event prediction. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Cc: Jens Axboe Cc: Hannes Reinecke Cc: Vincent Guittot Cc: "Rafael J . Wysocki" Cc: Peter Zijlstra Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1498227072-5980-1-git-send-email-daniel.lezcano@linaro.org --- kernel/irq/Kconfig | 3 ++ kernel/irq/Makefile | 1 + kernel/irq/handle.c | 2 ++ kernel/irq/internals.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 3 ++ kernel/irq/timings.c | 30 +++++++++++++++++ 6 files changed, 129 insertions(+) create mode 100644 kernel/irq/timings.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index fcbb1d6d51cb..27c4e774071c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -85,6 +85,9 @@ config GENERIC_MSI_IRQ_DOMAIN config HANDLE_DOMAIN_IRQ bool +config IRQ_TIMINGS + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c61fc9c2d1f7..e4aef7351f2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,6 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-$(CONFIG_IRQ_TIMINGS) += timings.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..eb4d3e8945b8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,6 +138,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags unsigned int irq = desc->irq_data.irq; struct irqaction *action; + record_irq_time(desc); + for_each_action_of_desc(desc, action) { irqreturn_t res; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a573e0771baf..b95b74920433 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -57,6 +58,7 @@ enum { IRQS_WAITING = 0x00000080, IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, + IRQS_TIMINGS = 0x00001000, }; #include "debug.h" @@ -255,6 +257,94 @@ static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif +#ifdef CONFIG_IRQ_TIMINGS + +#define IRQ_TIMINGS_SHIFT 5 +#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT) +#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1) + +/** + * struct irq_timings - irq timings storing structure + * @values: a circular buffer of u64 encoded values + * @count: the number of elements in the array + */ +struct irq_timings { + u64 values[IRQ_TIMINGS_SIZE]; + int count; +}; + +DECLARE_PER_CPU(struct irq_timings, irq_timings); + +static inline void irq_remove_timings(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_TIMINGS; +} + +static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) +{ + /* + * We don't need the measurement because the idle code already + * knows the next expiry event. + */ + if (act->flags & __IRQF_TIMER) + return; + + desc->istate |= IRQS_TIMINGS; +} + +extern void irq_timings_enable(void); +extern void irq_timings_disable(void); + +DECLARE_STATIC_KEY_FALSE(irq_timing_enabled); + +/* + * The interrupt number and the timestamp are encoded into a single + * u64 variable to optimize the size. + * 48 bit time stamp and 16 bit IRQ number is way sufficient. + * Who cares an IRQ after 78 hours of idle time? + */ +static inline u64 irq_timing_encode(u64 timestamp, int irq) +{ + return (timestamp << 16) | irq; +} + +static inline int irq_timing_decode(u64 value, u64 *timestamp) +{ + *timestamp = value >> 16; + return value & U16_MAX; +} + +/* + * The function record_irq_time is only called in one place in the + * interrupts handler. We want this function always inline so the code + * inside is embedded in the function and the static key branching + * code can act at the higher level. Without the explicit + * __always_inline we can end up with a function call and a small + * overhead in the hotpath for nothing. + */ +static __always_inline void record_irq_time(struct irq_desc *desc) +{ + if (!static_branch_likely(&irq_timing_enabled)) + return; + + if (desc->istate & IRQS_TIMINGS) { + struct irq_timings *timings = this_cpu_ptr(&irq_timings); + + timings->values[timings->count & IRQ_TIMINGS_MASK] = + irq_timing_encode(local_clock(), + irq_desc_get_irq(desc)); + + timings->count++; + } +} +#else +static inline void irq_remove_timings(struct irq_desc *desc) {} +static inline void irq_setup_timings(struct irq_desc *desc, + struct irqaction *act) {}; +static inline void record_irq_time(struct irq_desc *desc) {} +#endif /* CONFIG_IRQ_TIMINGS */ + + #ifdef CONFIG_GENERIC_IRQ_CHIP void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3577c091ac7b..5c11c1730ba5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1348,6 +1348,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_setup_timings(desc, new); + /* * Strictly no need to wake it up, but hung_task complains * when no hard interrupt wakes the thread up. @@ -1474,6 +1476,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); irq_release_resources(desc); + irq_remove_timings(desc); } #ifdef CONFIG_SMP diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c new file mode 100644 index 000000000000..56cf6870fa26 --- /dev/null +++ b/kernel/irq/timings.c @@ -0,0 +1,30 @@ +/* + * linux/kernel/irq/timings.c + * + * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include + +#include "internals.h" + +DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); + +DEFINE_PER_CPU(struct irq_timings, irq_timings); + +void irq_timings_enable(void) +{ + static_branch_enable(&irq_timing_enabled); +} + +void irq_timings_disable(void) +{ + static_branch_disable(&irq_timing_enabled); +} -- cgit v1.2.3 From e1c921495534002d727b15a76a2f8c20b6b108b5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 23 Jun 2017 16:11:08 +0200 Subject: genirq/timings: Add infrastructure for estimating the next interrupt arrival time An interrupt behaves with a burst of activity with periodic interval of time followed by one or two peaks of longer interval. As the time intervals are periodic, statistically speaking they follow a normal distribution and each interrupts can be tracked individually. Add a mechanism to compute the statistics on all interrupts, except the timers which are deterministic from a prediction point of view, as their expiry time is known. The goal is to extract the periodicity for each interrupt, with the last timestamp and sum them, so the next event can be predicted to a certain extent. Taking the earliest prediction gives the expected wakeup on the system (assuming a timer won't expire before). Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: Nicolas Pitre Cc: Jens Axboe Cc: Hannes Reinecke Cc: Vincent Guittot Cc: "Rafael J . Wysocki" Cc: Peter Zijlstra Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1498227072-5980-2-git-send-email-daniel.lezcano@linaro.org --- kernel/irq/internals.h | 19 +++ kernel/irq/timings.c | 339 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 358 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b95b74920433..9da14d125df4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -275,13 +275,21 @@ struct irq_timings { DECLARE_PER_CPU(struct irq_timings, irq_timings); +extern void irq_timings_free(int irq); +extern int irq_timings_alloc(int irq); + static inline void irq_remove_timings(struct irq_desc *desc) { desc->istate &= ~IRQS_TIMINGS; + + irq_timings_free(irq_desc_get_irq(desc)); } static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) { + int irq = irq_desc_get_irq(desc); + int ret; + /* * We don't need the measurement because the idle code already * knows the next expiry event. @@ -289,6 +297,17 @@ static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *ac if (act->flags & __IRQF_TIMER) return; + /* + * In case the timing allocation fails, we just want to warn, + * not fail, so letting the system boot anyway. + */ + ret = irq_timings_alloc(irq); + if (ret) { + pr_warn("Failed to allocate irq timing stats for irq%d (%d)", + irq, ret); + return; + } + desc->istate |= IRQS_TIMINGS; } diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 56cf6870fa26..c8c1d073fbf1 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,10 +8,16 @@ * published by the Free Software Foundation. * */ +#include #include +#include #include #include +#include #include +#include + +#include #include "internals.h" @@ -19,6 +25,18 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); +struct irqt_stat { + u64 next_evt; + u64 last_ts; + u64 variance; + u32 avg; + u32 nr_samples; + int anomalies; + int valid; +}; + +static DEFINE_IDR(irqt_stats); + void irq_timings_enable(void) { static_branch_enable(&irq_timing_enabled); @@ -28,3 +46,324 @@ void irq_timings_disable(void) { static_branch_disable(&irq_timing_enabled); } + +/** + * irqs_update - update the irq timing statistics with a new timestamp + * + * @irqs: an irqt_stat struct pointer + * @ts: the new timestamp + * + * The statistics are computed online, in other words, the code is + * designed to compute the statistics on a stream of values rather + * than doing multiple passes on the values to compute the average, + * then the variance. The integer division introduces a loss of + * precision but with an acceptable error margin regarding the results + * we would have with the double floating precision: we are dealing + * with nanosec, so big numbers, consequently the mantisse is + * negligeable, especially when converting the time in usec + * afterwards. + * + * The computation happens at idle time. When the CPU is not idle, the + * interrupts' timestamps are stored in the circular buffer, when the + * CPU goes idle and this routine is called, all the buffer's values + * are injected in the statistical model continuying to extend the + * statistics from the previous busy-idle cycle. + * + * The observations showed a device will trigger a burst of periodic + * interrupts followed by one or two peaks of longer time, for + * instance when a SD card device flushes its cache, then the periodic + * intervals occur again. A one second inactivity period resets the + * stats, that gives us the certitude the statistical values won't + * exceed 1x10^9, thus the computation won't overflow. + * + * Basically, the purpose of the algorithm is to watch the periodic + * interrupts and eliminate the peaks. + * + * An interrupt is considered periodically stable if the interval of + * its occurences follow the normal distribution, thus the values + * comply with: + * + * avg - 3 x stddev < value < avg + 3 x stddev + * + * Which can be simplified to: + * + * -3 x stddev < value - avg < 3 x stddev + * + * abs(value - avg) < 3 x stddev + * + * In order to save a costly square root computation, we use the + * variance. For the record, stddev = sqrt(variance). The equation + * above becomes: + * + * abs(value - avg) < 3 x sqrt(variance) + * + * And finally we square it: + * + * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * + * (value - avg) x (value - avg) < 9 x variance + * + * Statistically speaking, any values out of this interval is + * considered as an anomaly and is discarded. However, a normal + * distribution appears when the number of samples is 30 (it is the + * rule of thumb in statistics, cf. "30 samples" on Internet). When + * there are three consecutive anomalies, the statistics are resetted. + * + */ +static void irqs_update(struct irqt_stat *irqs, u64 ts) +{ + u64 old_ts = irqs->last_ts; + u64 variance = 0; + u64 interval; + s64 diff; + + /* + * The timestamps are absolute time values, we need to compute + * the timing interval between two interrupts. + */ + irqs->last_ts = ts; + + /* + * The interval type is u64 in order to deal with the same + * type in our computation, that prevent mindfuck issues with + * overflow, sign and division. + */ + interval = ts - old_ts; + + /* + * The interrupt triggered more than one second apart, that + * ends the sequence as predictible for our purpose. In this + * case, assume we have the beginning of a sequence and the + * timestamp is the first value. As it is impossible to + * predict anything at this point, return. + * + * Note the first timestamp of the sequence will always fall + * in this test because the old_ts is zero. That is what we + * want as we need another timestamp to compute an interval. + */ + if (interval >= NSEC_PER_SEC) { + memset(irqs, 0, sizeof(*irqs)); + irqs->last_ts = ts; + return; + } + + /* + * Pre-compute the delta with the average as the result is + * used several times in this function. + */ + diff = interval - irqs->avg; + + /* + * Increment the number of samples. + */ + irqs->nr_samples++; + + /* + * Online variance divided by the number of elements if there + * is more than one sample. Normally the formula is division + * by nr_samples - 1 but we assume the number of element will be + * more than 32 and dividing by 32 instead of 31 is enough + * precise. + */ + if (likely(irqs->nr_samples > 1)) + variance = irqs->variance >> IRQ_TIMINGS_SHIFT; + + /* + * The rule of thumb in statistics for the normal distribution + * is having at least 30 samples in order to have the model to + * apply. Values outside the interval are considered as an + * anomaly. + */ + if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { + /* + * After three consecutive anomalies, we reset the + * stats as it is no longer stable enough. + */ + if (irqs->anomalies++ >= 3) { + memset(irqs, 0, sizeof(*irqs)); + irqs->last_ts = ts; + return; + } + } else { + /* + * The anomalies must be consecutives, so at this + * point, we reset the anomalies counter. + */ + irqs->anomalies = 0; + } + + /* + * The interrupt is considered stable enough to try to predict + * the next event on it. + */ + irqs->valid = 1; + + /* + * Online average algorithm: + * + * new_average = average + ((value - average) / count) + * + * The variance computation depends on the new average + * to be computed here first. + * + */ + irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); + + /* + * Online variance algorithm: + * + * new_variance = variance + (value - average) x (value - new_average) + * + * Warning: irqs->avg is updated with the line above, hence + * 'interval - irqs->avg' is no longer equal to 'diff' + */ + irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); + + /* + * Update the next event + */ + irqs->next_evt = ts + irqs->avg; +} + +/** + * irq_timings_next_event - Return when the next event is supposed to arrive + * + * During the last busy cycle, the number of interrupts is incremented + * and stored in the irq_timings structure. This information is + * necessary to: + * + * - know if the index in the table wrapped up: + * + * If more than the array size interrupts happened during the + * last busy/idle cycle, the index wrapped up and we have to + * begin with the next element in the array which is the last one + * in the sequence, otherwise it is a the index 0. + * + * - have an indication of the interrupts activity on this CPU + * (eg. irq/sec) + * + * The values are 'consumed' after inserting in the statistical model, + * thus the count is reinitialized. + * + * The array of values **must** be browsed in the time direction, the + * timestamp must increase between an element and the next one. + * + * Returns a nanosec time based estimation of the earliest interrupt, + * U64_MAX otherwise. + */ +u64 irq_timings_next_event(u64 now) +{ + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + struct irqt_stat *irqs; + struct irqt_stat __percpu *s; + u64 ts, next_evt = U64_MAX; + int i, irq = 0; + + /* + * This function must be called with the local irq disabled in + * order to prevent the timings circular buffer to be updated + * while we are reading it. + */ + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Number of elements in the circular buffer: If it happens it + * was flushed before, then the number of elements could be + * smaller than IRQ_TIMINGS_SIZE, so the count is used, + * otherwise the array size is used as we wrapped. The index + * begins from zero when we did not wrap. That could be done + * in a nicer way with the proper circular array structure + * type but with the cost of extra computation in the + * interrupt handler hot path. We choose efficiency. + * + * Inject measured irq/timestamp to the statistical model + * while decrementing the counter because we consume the data + * from our circular buffer. + */ + for (i = irqts->count & IRQ_TIMINGS_MASK, + irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); + irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + + irq = irq_timing_decode(irqts->values[i], &ts); + + s = idr_find(&irqt_stats, irq); + if (s) { + irqs = this_cpu_ptr(s); + irqs_update(irqs, ts); + } + } + + /* + * Look in the list of interrupts' statistics, the earliest + * next event. + */ + idr_for_each_entry(&irqt_stats, s, i) { + + irqs = this_cpu_ptr(s); + + if (!irqs->valid) + continue; + + if (irqs->next_evt <= now) { + irq = i; + next_evt = now; + + /* + * This interrupt mustn't use in the future + * until new events occur and update the + * statistics. + */ + irqs->valid = 0; + break; + } + + if (irqs->next_evt < next_evt) { + irq = i; + next_evt = irqs->next_evt; + } + } + + return next_evt; +} + +void irq_timings_free(int irq) +{ + struct irqt_stat __percpu *s; + + s = idr_find(&irqt_stats, irq); + if (s) { + free_percpu(s); + idr_remove(&irqt_stats, irq); + } +} + +int irq_timings_alloc(int irq) +{ + struct irqt_stat __percpu *s; + int id; + + /* + * Some platforms can have the same private interrupt per cpu, + * so this function may be be called several times with the + * same interrupt number. Just bail out in case the per cpu + * stat structure is already allocated. + */ + s = idr_find(&irqt_stats, irq); + if (s) + return 0; + + s = alloc_percpu(*s); + if (!s) + return -ENOMEM; + + idr_preload(GFP_KERNEL); + id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); + idr_preload_end(); + + if (id < 0) { + free_percpu(s); + return id; + } + + return 0; +} -- cgit v1.2.3 From f59dd9c886acb3abb188e8e94a99436560976835 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:02 -0700 Subject: time: add get_timespec64 and put_timespec64 Add helper functions to convert between struct timespec64 and struct timespec at userspace boundaries. This is a preparatory patch to use timespec64 as the basic type internally in the kernel as timespec is not y2038 safe on 32 bit systems. The patch helps the cause by containing all data conversions at the userspace boundaries within these functions. Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/compat.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/time.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index ebd8bdc3fd68..73f26ba44a8a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +static int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +static int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) diff --git a/kernel/time/time.c b/kernel/time/time.c index 7c89e437c4d7..adb9853ca6b0 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -890,3 +890,31 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } + +int get_timespec64(struct timespec64 *ts, + const struct timespec __user *uts) +{ + struct timespec kts; + int ret; + + ret = copy_from_user(&kts, uts, sizeof(kts)); + if (ret) + return -EFAULT; + + ts->tv_sec = kts.tv_sec; + ts->tv_nsec = kts.tv_nsec; + + return 0; +} +EXPORT_SYMBOL_GPL(get_timespec64); + +int put_timespec64(const struct timespec64 *ts, + struct timespec __user *uts) +{ + struct timespec kts = { + .tv_sec = ts->tv_sec, + .tv_nsec = ts->tv_nsec + }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_timespec64); -- cgit v1.2.3 From d5b7ffbfbdacc29e4db035f90665951668fa9c58 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:03 -0700 Subject: time: introduce {get,put}_itimerspec64 As we change the user space type for the timerfd and posix timer functions to newer data types, we need some form of conversion helpers to avoid duplicating that logic. Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/compat.c | 21 +++++++++++++++++++++ kernel/time/time.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 73f26ba44a8a..a350deda503a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -586,6 +586,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } +int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits) +{ + + if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || + __compat_get_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_compat_itimerspec64); + +int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits) +{ + if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || + __compat_put_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_compat_itimerspec64); + /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/time.c b/kernel/time/time.c index adb9853ca6b0..44a8c1402133 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -918,3 +918,33 @@ int put_timespec64(const struct timespec64 *ts, return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); + +int get_itimerspec64(struct itimerspec64 *it, + const struct itimerspec __user *uit) +{ + int ret; + + ret = get_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = get_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(get_itimerspec64); + +int put_itimerspec64(const struct itimerspec64 *it, + struct itimerspec __user *uit) +{ + int ret; + + ret = put_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = put_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(put_itimerspec64); -- cgit v1.2.3 From 63a766a1780f9581e8885bdb64270a594a84f81a Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:04 -0700 Subject: posix-stubs: Conditionally include COMPAT_SYS_NI defines These apis only need to be defined if CONFIG_COMPAT is enabled. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 38f3b20efa29..65878221cbfb 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -41,12 +41,6 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif -COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC @@ -138,6 +132,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, } #ifdef CONFIG_COMPAT +COMPAT_SYS_NI(timer_create); +COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); + COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { -- cgit v1.2.3 From d829b8fb2431595422289cfc210f0a955a8bec74 Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Mon, 26 Jun 2017 19:33:33 +0800 Subject: genirq: Set irq masked state when initializing irq_desc The irq default state is set to disabled when allocating irq desc, but the masked state flag is not set. This is inconsistent vs. the state tracking logic which is used to prevent unnecessary calls to hardware level irq chip functions. Set the masked state flag as well. Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: tfiga@chromium.org Cc: briannorris@chromium.org Cc: dianders@chromium.org Link: http://lkml.kernel.org/r/1498476814-12563-1-git-send-email-jeffy.chen@rock-chips.com --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 35a95fadcfda..948b50e78549 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -116,6 +116,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; -- cgit v1.2.3 From bf22ff45bed664aefb5c4e43029057a199b7070c Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Mon, 26 Jun 2017 19:33:34 +0800 Subject: genirq: Avoid unnecessary low level irq function calls Check irq state in enable/disable/unmask/mask_irq to avoid unnecessary low level irq function calls. This has two advantages: - Conditionals are faster than hardware access - Solves issues with the underlying refcounting of the pinctrl infrastructure Suggested-by: Thomas Gleixner Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: tfiga@chromium.org Cc: briannorris@chromium.org Cc: dianders@chromium.org Link: http://lkml.kernel.org/r/1498476814-12563-2-git-send-email-jeffy.chen@rock-chips.com --- kernel/irq/chip.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc89eeb8a6b4..2e30d925a40d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -314,22 +314,32 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) - desc->irq_data.chip->irq_enable(&desc->irq_data); - else - desc->irq_data.chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } } static void __irq_disable(struct irq_desc *desc, bool mask) { - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } else if (mask) { - mask_irq(desc); + if (irqd_irq_disabled(&desc->irq_data)) { + if (mask) + mask_irq(desc); + } else { + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } else if (mask) { + mask_irq(desc); + } } } @@ -378,18 +388,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->irq_data.chip->irq_mask_ack) + if (desc->irq_data.chip->irq_mask_ack) { desc->irq_data.chip->irq_mask_ack(&desc->irq_data); - else { - desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); + } else { + mask_irq(desc); if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - irq_state_set_masked(desc); } void mask_irq(struct irq_desc *desc) { + if (irqd_irq_masked(&desc->irq_data)) + return; + if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); irq_state_set_masked(desc); @@ -398,6 +411,9 @@ void mask_irq(struct irq_desc *desc) void unmask_irq(struct irq_desc *desc) { + if (!irqd_irq_masked(&desc->irq_data)) + return; + if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); @@ -411,10 +427,7 @@ void unmask_threaded_irq(struct irq_desc *desc) if (chip->flags & IRQCHIP_EOI_THREADED) chip->irq_eoi(&desc->irq_data); - if (chip->irq_unmask) { - chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); - } + unmask_irq(desc); } /* -- cgit v1.2.3 From 1ba5c08b58a0c21fca222f1bf2fde184aa26103f Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 6 Jun 2017 14:17:39 +0200 Subject: kernel/module.c: suppress warning about unused nowarn variable This patch fix the following warning: kernel/module.c: In function 'add_usage_links': kernel/module.c:1653:6: warning: variable 'nowarn' set but not used [-Wunused-but-set-variable] [jeyu: folded in first patch since it only swapped the function order so that del_usage_links can be called from add_usage_links] Signed-off-by: Corentin Labbe Signed-off-by: Jessica Yu --- kernel/module.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3803449ca219..f546d574f436 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1666,31 +1666,36 @@ static inline void remove_notes_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static void add_usage_links(struct module *mod) +static void del_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; - int nowarn; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); mutex_unlock(&module_mutex); #endif } -static void del_usage_links(struct module *mod) +static int add_usage_links(struct module *mod) { + int ret = 0; #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); #endif + return ret; } static int module_add_modinfo_attrs(struct module *mod) @@ -1801,13 +1806,18 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; - add_usage_links(mod); + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + add_sect_attrs(mod, info); add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: -- cgit v1.2.3 From 673feb9d76ab3eddde7acfd94b206e321cfc90b9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 15:26:26 -0400 Subject: ftrace: Add :mod: caching infrastructure to trace_array This is the start of the infrastructure work to allow for tracing module functions before it is loaded. Currently the following command: # echo :mod:some-mod > set_ftrace_filter will enable tracing of all functions within the module "some-mod" if it is loaded. What we want, is if the module is not loaded, that line will be saved. When the module is loaded, then the "some-mod" will have that line executed on it, so that the functions within it starts being traced. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 12 +++++ 2 files changed, 148 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e5841dc14b5..1867edec6269 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1293,6 +1293,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) FTRACE_WARN_ON(hash->count); } +static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) +{ + list_del(&ftrace_mod->list); + kfree(ftrace_mod->module); + kfree(ftrace_mod->func); + kfree(ftrace_mod); +} + +static void clear_ftrace_mod_list(struct list_head *head) +{ + struct ftrace_mod_load *p, *n; + + /* stack tracer isn't supported yet */ + if (!head) + return; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(p, n, head, list) + free_ftrace_mod(p); + mutex_unlock(&ftrace_lock); +} + static void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) @@ -1346,6 +1368,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return hash; } + +static int ftrace_add_mod(struct trace_array *tr, + const char *func, const char *module, + int enable) +{ + struct ftrace_mod_load *ftrace_mod; + struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; + + ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + if (!ftrace_mod) + return -ENOMEM; + + ftrace_mod->func = kstrdup(func, GFP_KERNEL); + ftrace_mod->module = kstrdup(module, GFP_KERNEL); + ftrace_mod->enable = enable; + + if (!ftrace_mod->func || !ftrace_mod->module) + goto out_free; + + list_add(&ftrace_mod->list, mod_head); + + return 0; + + out_free: + free_ftrace_mod(ftrace_mod); + + return -ENOMEM; +} + static struct ftrace_hash * alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { @@ -3457,6 +3508,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, { struct ftrace_iterator *iter; struct ftrace_hash *hash; + struct list_head *mod_head; + struct trace_array *tr = ops->private; int ret = 0; ftrace_ops_init(ops); @@ -3478,18 +3531,23 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, mutex_lock(&ops->func_hash->regex_lock); - if (flag & FTRACE_ITER_NOTRACE) + if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - else + mod_head = tr ? &tr->mod_trace : NULL; + } else { hash = ops->func_hash->filter_hash; + mod_head = tr ? &tr->mod_notrace : NULL; + } if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; - if (file->f_flags & O_TRUNC) + if (file->f_flags & O_TRUNC) { iter->hash = alloc_ftrace_hash(size_bits); - else + clear_ftrace_mod_list(mod_head); + } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + } if (!iter->hash) { trace_parser_put(&iter->parser); @@ -3761,6 +3819,68 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return ret; } +static bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + const char this_mod[] = "__this_module"; + const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; + char modname[modname_size + 1]; + unsigned long val; + int n; + + n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + + if (n > modname_size) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static int cache_mod(struct trace_array *tr, + const char *func, char *module, int enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; + int ret; + + mutex_lock(&ftrace_lock); + + /* We do not cache inverse filters */ + if (func[0] == '!') { + func++; + ret = -EINVAL; + + /* Look to remove this hash */ + list_for_each_entry_safe(ftrace_mod, n, head, list) { + if (strcmp(ftrace_mod->module, module) != 0) + continue; + + /* no func matches all */ + if (!func || strcmp(func, "*") == 0 || + (ftrace_mod->func && + strcmp(ftrace_mod->func, func) == 0)) { + ret = 0; + free_ftrace_mod(ftrace_mod); + continue; + } + } + goto out; + } + + ret = -EINVAL; + /* We only care about modules that have not been loaded yet */ + if (module_exists(module)) + goto out; + + /* Save this string off, and execute it when the module is loaded */ + ret = ftrace_add_mod(tr, func, module, enable); + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -3768,10 +3888,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static int ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *func, char *cmd, char *module, int enable) + char *func_orig, char *cmd, char *module, int enable) { + char *func; int ret; + /* match_records() modifies func, and we need the original */ + func = kstrdup(func_orig, GFP_KERNEL); + if (!func) + return -ENOMEM; + /* * cmd == 'mod' because we only registered this func * for the 'mod' ftrace_func_command. @@ -3780,8 +3906,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, * parameter. */ ret = match_records(hash, func, strlen(func), module); + kfree(func); + if (!ret) - return -EINVAL; + return cache_mod(tr, func_orig, module, enable); if (ret < 0) return ret; return 0; @@ -5570,6 +5698,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { INIT_LIST_HEAD(&tr->func_probes); + INIT_LIST_HEAD(&tr->mod_trace); + INIT_LIST_HEAD(&tr->mod_notrace); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 69a3ab3ee4f5..d63550cdbdfa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -263,7 +263,10 @@ struct trace_array { struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; #ifdef CONFIG_DYNAMIC_FTRACE + /* All of these are protected by the ftrace_lock */ struct list_head func_probes; + struct list_head mod_trace; + struct list_head mod_notrace; #endif /* function tracing enabled */ int function_enabled; @@ -761,6 +764,15 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash; + +struct ftrace_mod_load { + struct list_head list; + char *func; + char *module; + int enable; +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; -- cgit v1.2.3 From 5985ea8bd5d1b820b909af49fbc2767a990080a6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 16:05:11 -0400 Subject: ftrace: Have the cached module list show in set_ftrace_filter When writing in a module filter into set_ftrace_filter for a module that is not yet loaded, it it cached, and will be executed when the module is loaded (although that is not implemented yet at this commit). Display the list of cached modules to be traced. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 112 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1867edec6269..bfdbce78064b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3112,6 +3112,7 @@ ftrace_allocate_pages(unsigned long num_to_init) struct ftrace_iterator { loff_t pos; loff_t func_pos; + loff_t mod_pos; struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; @@ -3119,6 +3120,8 @@ struct ftrace_iterator { struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; + struct trace_array *tr; + struct list_head *mod_list; int pidx; int idx; unsigned flags; @@ -3203,13 +3206,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; - if (iter->func_pos > *pos) + if (iter->mod_pos > *pos) return NULL; iter->probe = NULL; iter->probe_entry = NULL; iter->pidx = 0; - for (l = 0; l <= (*pos - iter->func_pos); ) { + for (l = 0; l <= (*pos - iter->mod_pos); ) { p = t_probe_next(m, &l); if (!p) break; @@ -3247,6 +3250,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) return 0; } +static void * +t_mod_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + + (*pos)++; + iter->pos = *pos; + + iter->mod_list = iter->mod_list->next; + + if (iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) { + iter->flags &= ~FTRACE_ITER_MOD; + return NULL; + } + + iter->mod_pos = *pos; + + return iter; +} + +static void *t_mod_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->mod_pos = iter->func_pos; + + /* probes are only available if tr is set */ + if (!iter->tr) + return NULL; + + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_mod_next(m, &l); + if (!p) + break; + } + if (!p) { + iter->flags &= ~FTRACE_ITER_MOD; + return t_probe_start(m, pos); + } + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_MOD; + + return iter; +} + +static int +t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_mod_load *ftrace_mod; + struct trace_array *tr = iter->tr; + + if (WARN_ON_ONCE(!iter->mod_list) || + iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) + return -EIO; + + ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); + + if (ftrace_mod->func) + seq_printf(m, "%s", ftrace_mod->func); + else + seq_putc(m, '*'); + + seq_printf(m, ":mod:%s\n", ftrace_mod->module); + + return 0; +} + static void * t_func_next(struct seq_file *m, loff_t *pos) { @@ -3288,7 +3367,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - loff_t l = *pos; /* t_hash_start() must use original pos */ + loff_t l = *pos; /* t_probe_start() must use original pos */ void *ret; if (unlikely(ftrace_disabled)) @@ -3297,16 +3376,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_next(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_next(m, pos); + if (iter->flags & FTRACE_ITER_PRINTALL) { /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_probe_start(m, &l); + return t_mod_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_probe_start(m, &l); + return t_mod_start(m, &l); return ret; } @@ -3315,7 +3397,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3344,15 +3426,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_probe_start(m, pos); + return t_mod_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_PROBE) - return t_probe_start(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3368,7 +3450,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_probe_start(m, pos); + return t_mod_start(m, pos); return iter; } @@ -3402,6 +3484,9 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_show(m, iter); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_show(m, iter); + if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) seq_puts(m, "#### no functions disabled ####\n"); @@ -3528,17 +3613,20 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; + iter->tr = tr; mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - mod_head = tr ? &tr->mod_trace : NULL; + mod_head = tr ? &tr->mod_notrace : NULL; } else { hash = ops->func_hash->filter_hash; - mod_head = tr ? &tr->mod_notrace : NULL; + mod_head = tr ? &tr->mod_trace : NULL; } + iter->mod_list = mod_head; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; -- cgit v1.2.3 From d7fbf8df7ca0a5c7e85db79f7005f99cb461c525 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 10:57:21 -0400 Subject: ftrace: Implement cached modules tracing on module load If a module is cached in the set_ftrace_filter, and that module is loaded, then enable tracing on that module as if the cached module text was written into set_ftrace_filter just as the module is loaded. # echo ":mod:kvm_intel" > # cat /sys/kernel/tracing/set_ftrace_filter #### all functions enabled #### :mod:kvm_intel # modprobe kvm_intel # cat /sys/kernel/tracing/set_ftrace_filter vmx_get_rflags [kvm_intel] vmx_get_pkru [kvm_intel] vmx_get_interrupt_shadow [kvm_intel] vmx_rdtscp_supported [kvm_intel] vmx_invpcid_supported [kvm_intel] [..] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bfdbce78064b..f1ccf8be9df7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3969,6 +3969,97 @@ static int cache_mod(struct trace_array *tr, return ret; } +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable); + +static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, + char *mod, bool enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct ftrace_hash **orig_hash, *new_hash; + LIST_HEAD(process_mods); + char *func; + int ret; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, + *orig_hash); + if (!new_hash) + return; /* Warn? */ + + mutex_lock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, head, list) { + + if (strcmp(ftrace_mod->module, mod) != 0) + continue; + + if (ftrace_mod->func) + func = kstrdup(ftrace_mod->func, GFP_KERNEL); + else + func = kstrdup("*", GFP_KERNEL); + + if (!func) /* warn? */ + continue; + + list_del(&ftrace_mod->list); + list_add(&ftrace_mod->list, &process_mods); + + /* Use the newly allocated func, as it may be "*" */ + kfree(ftrace_mod->func); + ftrace_mod->func = func; + } + + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { + + func = ftrace_mod->func; + + /* Grabs ftrace_lock, which is why we have this extra step */ + match_records(new_hash, func, strlen(func), mod); + free_ftrace_mod(ftrace_mod); + } + + mutex_lock(&ftrace_lock); + + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + new_hash, enable); + mutex_unlock(&ftrace_lock); + + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(new_hash); +} + +static void process_cached_mods(const char *mod_name) +{ + struct trace_array *tr; + char *mod; + + mod = kstrdup(mod_name, GFP_KERNEL); + if (!mod) + return; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!list_empty(&tr->mod_trace)) + process_mod_list(&tr->mod_trace, tr->ops, mod, true); + if (!list_empty(&tr->mod_notrace)) + process_mod_list(&tr->mod_notrace, tr->ops, mod, false); + } + mutex_unlock(&trace_types_lock); + + kfree(mod); +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -5682,6 +5773,8 @@ void ftrace_module_enable(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + + process_cached_mods(mod->name); } void ftrace_module_init(struct module *mod) -- cgit v1.2.3 From 8c08f0d5c6fb10ff93ffb1cbf416f4f1c3a52a80 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 11:47:31 -0400 Subject: ftrace: Have cached module filters be an active filter When a module filter is added to set_ftrace_filter, if the module is not loaded, it is cached. This should be considered an active filter, and function tracing should be filtered by this. That is, if a cached module filter is the only filter set, then no function tracing should be happening, as all the functions available will be filtered out. This makes sense, as the reason to add a cached module filter, is to trace the module when you load it. There shouldn't be any other tracing happening until then. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 20 +++++++++++++++----- kernel/trace/trace.h | 7 ++++++- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ccf8be9df7..914539e3e301 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1410,6 +1410,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) if (!new_hash) return NULL; + if (hash) + new_hash->flags = hash->flags; + /* Empty hash? */ if (ftrace_hash_empty(hash)) return new_hash; @@ -1454,7 +1457,7 @@ __ftrace_hash_move(struct ftrace_hash *src) /* * If the new source is empty, just return the empty_hash. */ - if (!src->count) + if (ftrace_hash_empty(src)) return EMPTY_HASH; /* @@ -1471,6 +1474,8 @@ __ftrace_hash_move(struct ftrace_hash *src) if (!new_hash) return NULL; + new_hash->flags = src->flags; + size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; @@ -1701,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, struct dyn_ftrace *rec; bool update = false; int count = 0; - int all = 0; + int all = false; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -1722,7 +1727,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, hash = ops->func_hash->filter_hash; other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) - all = 1; + all = true; } else { inc = !inc; hash = ops->func_hash->notrace_hash; @@ -4028,6 +4033,9 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, free_ftrace_mod(ftrace_mod); } + if (enable && list_empty(head)) + new_hash->flags &= ~FTRACE_HASH_FL_MOD; + mutex_lock(&ftrace_lock); ret = ftrace_hash_move_and_update_ops(ops, orig_hash, @@ -5035,9 +5043,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); - if (filter_hash) + if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - else + if (!list_empty(&iter->tr->mod_trace)) + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } else orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d63550cdbdfa..13823951e42b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,10 +773,15 @@ struct ftrace_mod_load { int enable; }; +enum { + FTRACE_HASH_FL_MOD = (1 << 0), +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; + unsigned long flags; struct rcu_head rcu; }; @@ -785,7 +790,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { - return !hash || !hash->count; + return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); } /* Standard output formatting function used for function return traces */ -- cgit v1.2.3 From 49368a47f6dc1b256e3b83813da5c9b0731fe268 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sat, 3 Jun 2017 20:52:32 +1000 Subject: PM / hibernate: Use CONFIG_HAVE_SET_MEMORY for include condition Kbuild reported a build failure when CONFIG_STRICT_KERNEL_RWX was enabled on powerpc. We don't yet have ARCH_HAS_SET_MEMORY and ppc32 saw a build failure. I've only done a basic compile test with a config that has hibernation enabled. Fixes: 50327ddfbc92 (kernel/power/snapshot.c: use set_memory.h header) Reported-by: Christophe Leroy Signed-off-by: Balbir Singh Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fa46606f3356..71730d672290 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,13 +36,13 @@ #include #include #include -#ifdef CONFIG_STRICT_KERNEL_RWX +#ifdef CONFIG_ARCH_HAS_SET_MEMORY #include #endif #include "power.h" -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -77,7 +77,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_STRICT_KERNEL_RWX */ +#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); -- cgit v1.2.3 From eba74c294467d55c697e2199c37dfaf8126fe396 Mon Sep 17 00:00:00 2001 From: BaoJun Luo Date: Tue, 27 Jun 2017 02:10:44 +0200 Subject: PM / hibernate: Drop redundant parameter of swsusp_alloc() The first parameter of swsusp_alloc is not used, so drop it. Signed-off-by: BaoJun Luo [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71730d672290..b7708e319941 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1929,8 +1929,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, * also be located in the high memory, because of the way in which * copy_data_pages() works. */ -static int swsusp_alloc(struct memory_bitmap *orig_bm, - struct memory_bitmap *copy_bm, +static int swsusp_alloc(struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { @@ -1976,7 +1975,7 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { printk(KERN_ERR "PM: Memory allocation failed\n"); return -ENOMEM; } -- cgit v1.2.3 From 6a9c981b1e9657ca5866d10aa38b8a4fe1159138 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:02:49 -0400 Subject: ftrace: Remove unused function ftrace_arch_read_dyn_info() ftrace_arch_read_dyn_info() was used so that archs could add its own debug information into the dyn_ftrace_total_info in the tracefs file system. That file is for debugging usage of dynamic ftrace. No arch uses that function anymore, so just get rid of it. This also allows for tracing_read_dyn_info() to be cleaned up a bit. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 19ac2088d10a..14318ce92b13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6737,33 +6737,18 @@ static const struct file_operations tracing_stats_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + char buf[64]; /* Not too big for a shallow stack */ int r; - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + r = scnprintf(buf, 63, "%ld", *p); buf[r++] = '\n'; - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - mutex_unlock(&dyn_info_mutex); - - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static const struct file_operations tracing_dyn_info_fops = { -- cgit v1.2.3 From 83dd14933e33a45e9b366c572e15505982b46845 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:04:40 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info file The dyn_ftrace_total_info file is used to show how many functions have been converted into nops and can be used by ftrace. The problem is that it does not get decremented when functions are removed (init boot code being freed, and modules being freed). That means the number is very inaccurate everytime functions are removed from the ftrace tables. Decrement it when functions are removed. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 914539e3e301..7509ef9810bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5705,6 +5705,7 @@ void ftrace_release_mod(struct module *mod) if (pg == ftrace_pages) ftrace_pages = next_to_ftrace_page(last_pg); + ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); -- cgit v1.2.3 From d914ba37d7145acb9fd3bb23075c2d56e5a44eb6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jun 2017 19:01:55 -0700 Subject: tracing: Add support for recording tgid of tasks Inorder to support recording of tgid, the following changes are made: * Introduce a new API (tracing_record_taskinfo) to additionally record the tgid along with the task's comm at the same time. This has has the benefit of not setting trace_cmdline_save before all the information for a task is saved. * Add a new API tracing_record_taskinfo_sched_switch to record task information for 2 tasks at a time (previous and next) and use it from sched_switch probe. * Preserve the old API (tracing_record_cmdline) and create it as a wrapper around the new one so that existing callers aren't affected. * Reuse the existing sched_switch and sched_wakeup probes to record tgid information and add a new option 'record-tgid' to enable recording of tgid When record-tgid option isn't enabled to being with, we take care to make sure that there's isn't memory or runtime overhead. Link: http://lkml.kernel.org/r/20170627020155.5139-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 105 ++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 7 +++ kernel/trace/trace_events.c | 42 ++++++++++++++- kernel/trace/trace_sched_switch.c | 72 +++++++++++++++++++++----- 4 files changed, 201 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 14318ce92b13..ab9db750dd29 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_cmdline_save); +static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { - __this_cpu_write(trace_cmdline_save, true); + __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { @@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) } } +static int *tgid_map; + #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer { static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_taskinfo_disabled __read_mostly; static inline char *get_saved_cmdlines(int idx) { @@ -1990,16 +1992,87 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -void tracing_record_cmdline(struct task_struct *tsk) +int trace_find_tgid(int pid) +{ + if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) + return 0; + + return tgid_map[pid]; +} + +static int trace_save_tgid(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + return 0; + + tgid_map[tsk->pid] = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task - task to record + * @flags - TRACE_RECORD_CMDLINE for recording comm + * - TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) + return; + if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) + return; + if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) return; - if (!__this_cpu_read(trace_cmdline_save)) + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev - previous task during sched_switch + * @next - next task during sched_switch + * @flags - TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) return; - if (trace_save_cmdline(tsk)) - __this_cpu_write(trace_cmdline_save, false); + if ((flags & TRACE_RECORD_CMDLINE) && + (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) + return; + + if ((flags & TRACE_RECORD_TGID) && + (!trace_save_tgid(prev) || !trace_save_tgid(next))) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); } /* @@ -3144,7 +3217,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) #endif if (!iter->snapshot) - atomic_inc(&trace_record_cmdline_disabled); + atomic_inc(&trace_record_taskinfo_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -3189,7 +3262,7 @@ static void s_stop(struct seq_file *m, void *p) #endif if (!iter->snapshot) - atomic_dec(&trace_record_cmdline_disabled); + atomic_dec(&trace_record_taskinfo_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -4236,6 +4309,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_RECORD_TGID) { + if (!tgid_map) + tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + GFP_KERNEL); + if (!tgid_map) { + tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + } + if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13823951e42b..6ade1c55cc3a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -640,6 +640,9 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_start_tgid_record(void); +void tracing_stop_tgid_record(void); + int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -700,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE @@ -1124,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ + C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ @@ -1440,6 +1445,8 @@ struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); +extern void trace_event_enable_tgid_record(bool enable); + extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 83dfd0dbbbfe..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) mutex_unlock(&event_mutex); } +void trace_event_enable_tgid_record(bool enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + mutex_lock(&event_mutex); + do_for_each_event_file(tr, file) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } else { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, + &file->flags); + } + } while_for_each_event_file(); + mutex_unlock(&event_mutex); +} + static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + tgid = true; + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { - tracing_stop_cmdline_record(); + if (cmd) + tracing_stop_cmdline_record(); + if (tgid) + tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -12,27 +12,38 @@ #include "trace.h" -static int sched_ref; +#define RECORD_CMDLINE 1 +#define RECORD_TGID 2 + +static int sched_cmdline_ref; +static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(prev); - tracing_record_cmdline(next); + if (!flags) + return; + tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(current); + if (!flags) + return; + tracing_record_taskinfo(current, flags); } static int tracing_sched_register(void) @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } -static void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(int ops) { + bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref++; + break; + + case RECORD_TGID: + sched_tgid_ref++; + break; + } + + if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } -static void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref--; + break; + + case RECORD_TGID: + sched_tgid_ref--; + break; + } + + if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { - tracing_start_sched_switch(); + tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { - tracing_stop_sched_switch(); + tracing_stop_sched_switch(RECORD_CMDLINE); +} + +void tracing_start_tgid_record(void) +{ + tracing_start_sched_switch(RECORD_TGID); +} + +void tracing_stop_tgid_record(void) +{ + tracing_stop_sched_switch(RECORD_TGID); } -- cgit v1.2.3 From 441dae8f2f2975c68101a84bc3f528ec95ecf7c3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 25 Jun 2017 22:38:43 -0700 Subject: tracing: Add support for display of tgid in trace output Earlier patches introduced ability to record the tgid using the 'record-tgid' option. Here we read the tgid and output it if the option is enabled. Link: http://lkml.kernel.org/r/20170626053844.5746-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 ++++++++++++++++++++++-------------- kernel/trace/trace_output.c | 9 +++++++++ 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab9db750dd29..c579dea4a0eb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3319,23 +3319,29 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER_RECORD_TGID; + print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" - "# | | | | |\n"); + + seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { - print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n" - "# / _----=> need-resched\n" - "# | / _---=> hardirq/softirq\n" - "# || / _--=> preempt-depth\n" - "# ||| / delay\n" - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" - "# | | | |||| | |\n"); + bool tgid = flags & TRACE_ITER_RECORD_TGID; + + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); + seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); + seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); } void @@ -3651,9 +3657,11 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + print_func_help_header_irq(iter->trace_buffer, + m, trace_flags); else - print_func_help_header(iter->trace_buffer, m); + print_func_help_header(iter->trace_buffer, m, + trace_flags); } } } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 01ff99969ca7..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -597,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + unsigned int tgid = trace_find_tgid(entry->pid); + + if (!tgid) + trace_seq_printf(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); -- cgit v1.2.3 From 93437353daeff31bd5b11810daa4d2d509d1a64e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 26 May 2017 14:12:25 -0700 Subject: module: use list_for_each_entry_rcu() on find_module_all() The module list has been using RCU in a lot of other calls for a while now, we just overlooked changing this one over to use RCU. Signed-off-by: Luis R. Rodriguez Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f546d574f436..afc6ede7bcdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -603,7 +603,7 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) -- cgit v1.2.3 From 165d1cc0074b2f938586274776d029b9bce914c4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 23 Jun 2017 12:19:12 -0700 Subject: kmod: reduce atomic operations on kmod_concurrent and simplify When checking if we want to allow a kmod thread to kick off we increment, then read to see if we should enable a thread. If we were over the allowed limit limit we decrement. Splitting the increment far apart from decrement means there could be a time where two increments happen potentially giving a false failure on a thread which should have been allowed. CPU1 CPU2 atomic_inc() atomic_inc() atomic_read() atomic_read() atomic_dec() atomic_dec() In this case a read on CPU1 gets the atomic_inc()'s and we could negate it from getting a kmod thread. We could try to prevent this with a lock or preemption but that is overkill. We can fix by reducing the number of atomic operations. We do this by inverting the logic of of the enabler, instead of incrementing kmod_concurrent as we get new kmod users, define the variable kmod_concurrent_max as the max number of currently allowed kmod users and as we get new kmod users just decrement it if its still positive. This combines the dec and read in one atomic operation. In this case we no longer get the same false failure: CPU1 CPU2 atomic_dec_if_positive() atomic_dec_if_positive() atomic_inc() atomic_inc() The number of threads is computed at init, and since the current computation of kmod_concurrent includes the thread count we can avoid setting kmod_concurrent_max later in boot through an init call by simply sticking to 50 as the kmod_concurrent_max. The assumption here is a system with modules must at least have ~16 MiB of RAM. Suggested-by: Petr Mladek Suggested-by: Dmitry Torokhov Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Signed-off-by: Jessica Yu --- kernel/kmod.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..ff68198fe83b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ #include -extern int max_threads; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -56,6 +54,20 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES +/* + * Assuming: + * + * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + * (u64) THREAD_SIZE * 8UL); + * + * If you need less than 50 threads would mean we're dealing with systems + * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * and this would only be an be an upper limit, after which the OOM killer + * would take effect. Systems like these are very unlikely if modules are + * enabled. + */ +#define MAX_KMOD_CONCURRENT 50 +static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); /* modprobe_path is set via /proc/sys. @@ -127,10 +139,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; int ret; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; /* @@ -154,21 +163,7 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. - * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. --RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { + if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { /* We may be blaming an innocent here, but unlikely */ if (kmod_loop_msg < 5) { printk(KERN_ERR @@ -176,7 +171,6 @@ int __request_module(bool wait, const char *fmt, ...) module_name); kmod_loop_msg++; } - atomic_dec(&kmod_concurrent); return -ENOMEM; } @@ -184,10 +178,12 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_dec(&kmod_concurrent); + atomic_inc(&kmod_concurrent_max); + return ret; } EXPORT_SYMBOL(__request_module); + #endif /* CONFIG_MODULES */ static void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.2.3 From 3b58a3c72f484393c65995a551902945f5a18c70 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 09:09:38 -0400 Subject: ftrace: Unlock hash mutex on failed allocation in process_mod_list() If the new_hash fails to allocate, then unlock the hash mutex on error. Reported-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7509ef9810bf..2c79630cd267 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3997,7 +3997,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!new_hash) - return; /* Warn? */ + goto out; /* warn? */ mutex_lock(&ftrace_lock); @@ -4042,6 +4042,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash, enable); mutex_unlock(&ftrace_lock); + out: mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(new_hash); -- cgit v1.2.3 From 4ec78467858739c0119569c0610676aa50dfa8fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 11:57:03 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info for init functions Init boot up functions may be traced, but they are also freed when the kernel finishes booting. These are removed from the ftrace tables, and the debug variable for dyn_ftrace_total_info needs to reflect that as well. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c79630cd267..e392f750a1cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5825,6 +5825,7 @@ void __init ftrace_free_init_mem(void) if (!rec) continue; pg->index--; + ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); -- cgit v1.2.3 From 824ecbe01c5d833b8c8a371c209e3ac3a76cd18a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 25 Jun 2017 00:27:59 -0400 Subject: cgroup: restructure cgroup_procs_write_permission() Restructure cgroup_procs_write_permission() to make extending permission logic easier. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 57 +++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dbfd7028b1c6..d48069ee84c2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2363,27 +2363,12 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - int ret = 0; - - if (cgroup_on_dfl(dst_cgrp)) { - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup *cgrp; - struct inode *inode; - - spin_lock_irq(&css_set_lock); - cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - while (!cgroup_is_descendant(dst_cgrp, cgrp)) - cgrp = cgroup_parent(cgrp); + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *src_cgrp, *com_cgrp; + struct inode *inode; + int ret; - ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_file.kn); - if (inode) { - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - } - } else { + if (!cgroup_on_dfl(dst_cgrp)) { const struct cred *cred = current_cred(); const struct cred *tcred = get_task_cred(task); @@ -2391,14 +2376,38 @@ static int cgroup_procs_write_permission(struct task_struct *task, * even if we're attaching all tasks in the thread group, * we only need to check permissions on one of them. */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->euid, tcred->suid)) + ret = 0; + else ret = -EACCES; + put_cred(tcred); + return ret; } - return ret; + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* and the common ancestor */ + com_cgrp = src_cgrp; + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + return 0; } /* -- cgit v1.2.3 From 5136f6365ce3eace5a926e10f16ed2a233db5ba9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jun 2017 14:30:28 -0400 Subject: cgroup: implement "nsdelegate" mount option Currently, cgroup only supports delegation to !root users and cgroup namespaces don't get any special treatments. This limits the usefulness of cgroup namespaces as they by themselves can't be safe delegation boundaries. A process inside a cgroup can change the resource control knobs of the parent in the namespace root and may move processes in and out of the namespace if cgroups outside its namespace are visible somehow. This patch adds a new mount option "nsdelegate" which makes cgroup namespaces delegation boundaries. If set, cgroup behaves as if write permission based delegation took place at namespace boundaries - writes to the resource control knobs from the namespace root are denied and migration crossing the namespace boundary aren't allowed from inside the namespace. This allows cgroup namespace to function as a delegation boundary by itself. v2: Silently ignore nsdelegate specified on !init mounts. Signed-off-by: Tejun Heo Cc: Aravind Anbudurai Cc: Serge Hallyn Cc: Eric Biederman --- kernel/cgroup/cgroup.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d48069ee84c2..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } +static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +{ + char *token; + + *root_flags = 0; + + if (!data) + return 0; + + while ((token = strsep(&data, ",")) != NULL) { + if (!strcmp(token, "nsdelegate")) { + *root_flags |= CGRP_ROOT_NS_DELEGATE; + continue; + } + + pr_err("cgroup2: unknown option \"%s\"\n", token); + return -EINVAL; + } + + return 0; +} + +static void apply_cgroup_root_flags(unsigned int root_flags) +{ + if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { + if (root_flags & CGRP_ROOT_NS_DELEGATE) + cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + } +} + +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) + seq_puts(seq, ",nsdelegate"); + return 0; +} + static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - pr_err("remount is not allowed\n"); - return -EINVAL; + unsigned int root_flags; + int ret; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) + return ret; + + apply_cgroup_root_flags(root_flags); + return 0; } /* @@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; + int ret; get_cgroup_ns(ns); @@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + unsigned int root_flags; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) { put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); + return ERR_PTR(ret); } + cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); + if (!IS_ERR(dentry)) + apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); @@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct kernfs_open_file *of) { struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; struct cgroup *src_cgrp, *com_cgrp; struct inode *inode; int ret; @@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (ret) return ret; + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. + */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, root_cgrp) || + !cgroup_is_descendant(dst_cgrp, root_cgrp))) + return -ENOENT; + return 0; } @@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; + /* + * If namespaces are delegation boundaries, disallow writes to + * files in an non-init namespace root from inside the namespace + * except for the files explicitly marked delegatable - + * cgroup.procs and cgroup.subtree_control. + */ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; + if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", + .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, @@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.subtree_control", + .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, -- cgit v1.2.3 From 2287d8664fe7345ead891017eccd879fc605305e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 Jun 2017 18:15:38 +0200 Subject: timers: Make the cpu base lock raw The timers cpu base lock could not be converted to a raw spinlock becaue the lock held time was non-deterministic due to cascading and long lasting timer wheel traversals. The rework of the timer wheel to the new non-cascading model removed also the wheel traversals and the lock held times are deterministic now. This allows to make the lock raw and thereby unbreaks NOHz* on preempt-RT. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20170627161538.30257-1-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 709a404bd133..71ce3f4eead3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -195,7 +195,7 @@ EXPORT_SYMBOL(jiffies_64); #endif struct timer_base { - spinlock_t lock; + raw_spinlock_t lock; struct timer_list *running_timer; unsigned long clk; unsigned long next_expiry; @@ -913,10 +913,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); - spin_lock_irqsave(&base->lock, *flags); + raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; - spin_unlock_irqrestore(&base->lock, *flags); + raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } @@ -986,9 +986,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); base = new_base; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); } @@ -1013,7 +1013,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } out_unlock: - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -1106,16 +1106,16 @@ void add_timer_on(struct timer_list *timer, int cpu) if (base != new_base) { timer->flags |= TIMER_MIGRATING; - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); base = new_base; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } debug_activate(timer, timer->expires); internal_add_timer(base, timer); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1141,7 +1141,7 @@ int del_timer(struct timer_list *timer) if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; @@ -1168,7 +1168,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -1299,13 +1299,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) data = timer->data; if (timer->flags & TIMER_IRQSAFE) { - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, data); - spin_lock(&base->lock); + raw_spin_lock(&base->lock); } else { - spin_unlock_irq(&base->lock); + raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); + raw_spin_lock_irq(&base->lock); } } } @@ -1474,7 +1474,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; @@ -1502,7 +1502,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if ((expires - basem) > TICK_NSEC) base->is_idle = true; } - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } @@ -1590,7 +1590,7 @@ static inline void __run_timers(struct timer_base *base) if (!time_after_eq(jiffies, base->clk)) return; - spin_lock_irq(&base->lock); + raw_spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk)) { @@ -1601,7 +1601,7 @@ static inline void __run_timers(struct timer_base *base) expire_timers(base, heads + levels); } base->running_timer = NULL; - spin_unlock_irq(&base->lock); + raw_spin_unlock_irq(&base->lock); } /* @@ -1786,16 +1786,16 @@ int timers_dead_cpu(unsigned int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; @@ -1811,7 +1811,7 @@ static void __init init_timer_cpu(int cpu) for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; - spin_lock_init(&base->lock); + raw_spin_lock_init(&base->lock); base->clk = jiffies; } } -- cgit v1.2.3 From ff801b716effd652f420204eddb36f6e4a716819 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 08:25:52 +0200 Subject: sched/numa: Hide numa_wake_affine() from UP build Stephen reported the following build warning in UP: kernel/sched/fair.c:2657:9: warning: 'struct sched_domain' declared inside parameter list ^ /home/sfr/next/next/kernel/sched/fair.c:2657:9: warning: its scope is only this definition or declaration, which is probably not what you want Hide the numa_wake_affine() inline stub on UP builds to get rid of it. Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f4f155adf5f..008c514dc241 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2652,12 +2652,14 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +#ifdef CONFIG_SMP static inline bool numa_wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { return true; } +#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void -- cgit v1.2.3 From 96b5b19459b3c2aed2872bac42cbe19edfae710f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 28 Jun 2017 18:32:31 -0700 Subject: module: make the modinfo name const This can be accomplished by making blacklisted() also accept const. Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook [jeyu: fix typo] Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index afc6ede7bcdf..d07287707557 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,7 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { - char *name; + const char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -3265,7 +3265,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, /* module_blacklist is a comma-separated list of module names */ static char *module_blacklist; -static bool blacklisted(char *module_name) +static bool blacklisted(const char *module_name) { const char *p; size_t len; -- cgit v1.2.3 From 0f17976568b3f72e676450af0c0db6f8752253d6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Jun 2017 10:05:45 -0400 Subject: ftrace: Fix regression with module command in stack_trace_filter When doing the following command: # echo ":mod:kvm_intel" > /sys/kernel/tracing/stack_trace_filter it triggered a crash. This happened with the clean up of probes. It required all callers to the regex function (doing ftrace filtering) to have ops->private be a pointer to a trace_array. But for the stack tracer, that is not the case. Allow for the ops->private to be NULL, and change the function command callbacks to handle the trace_array pointer being NULL as well. Fixes: d2afd57a4b96 ("tracing/ftrace: Allow instances to have their own function probes") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_functions.c | 12 ++++++++++++ kernel/trace/trace_stack.c | 6 ++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e5841dc14b5..b308be30dfb9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4337,9 +4337,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - if (WARN_ON_ONCE(!tr)) - return -EINVAL; - mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..091e801145c9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6881,6 +6881,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *number; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a3bddbfd0874..a0910c0cdf2e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; @@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, @@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &dump_probe_ops; /* Only dump once. */ @@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, { struct ftrace_probe_ops *ops; + if (!tr) + return -ENODEV; + ops = &cpudump_probe_ops; /* Only dump once. */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 76aa04d4c925..b4a751e8f9d6 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -409,7 +409,9 @@ static const struct file_operations stack_trace_fops = { static int stack_trace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } @@ -476,7 +478,7 @@ static __init int stack_trace_init(void) NULL, &stack_trace_fops); trace_create_file("stack_trace_filter", 0444, d_tracer, - NULL, &stack_trace_filter_fops); + &trace_ops, &stack_trace_filter_fops); if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); -- cgit v1.2.3 From 14dc6f04f49dc12614d7e90928b495b8d73cd471 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 27 Jun 2017 23:08:34 -0700 Subject: bpf: Add syscall lookup support for fd array and htab This patch allows userspace to do BPF_MAP_LOOKUP_ELEM on BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS. The lookup returns a prog-id or map-id to the userspace. The userspace can then use the BPF_PROG_GET_FD_BY_ID or BPF_MAP_GET_FD_BY_ID to get a fd. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 27 +++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ kernel/bpf/map_in_map.c | 5 +++++ kernel/bpf/map_in_map.h | 1 + kernel/bpf/syscall.c | 16 +++++++++++++--- 5 files changed, 67 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ecb43542246e..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -334,6 +334,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* only called from syscall */ +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **elem, *ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + elem = array_map_lookup_elem(map, key); + if (elem && (ptr = READ_ONCE(*elem))) + *value = map->ops->map_fd_sys_lookup_elem(ptr); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) bpf_prog_put(ptr); } +static u32 prog_fd_array_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_prog *)ptr)->aux->id; +} + /* decrement refcnt of all bpf_progs that are stored in this map */ void bpf_fd_array_map_clear(struct bpf_map *map) { @@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, + .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -585,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 004334ea13ba..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1243,6 +1243,26 @@ static void fd_htab_map_free(struct bpf_map *map) htab_map_free(map); } +/* only called from syscall */ +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + ptr = htab_map_lookup_elem(map, key); + if (ptr) + *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 59bcdf821ae4..1da574612bea 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr) */ bpf_map_put(ptr); } + +u32 bpf_map_fd_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_map *)ptr)->id; +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 177fadb689dc..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); +u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8942c820d620..4409ccca8831 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -24,6 +24,13 @@ #include #include +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -411,6 +418,8 @@ static int map_lookup_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + value_size = sizeof(u32); else value_size = map->value_size; @@ -426,9 +435,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - err = -ENOTSUPP; + } else if (IS_FD_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); -- cgit v1.2.3 From 8007e40a24e12d35189203370268c7278f29ab74 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 28 Jun 2017 10:41:24 -0700 Subject: bpf: Fix out-of-bound access on interpreters[] The index is off-by-one when fp->aux->stack_depth has already been rounded up to 32. In particular, if stack_depth is 512, the index will be 16. The fix is to round_up and then takes -1 instead of round_down. [ 22.318680] ================================================================== [ 22.319745] BUG: KASAN: global-out-of-bounds in bpf_prog_select_runtime+0x48a/0x670 [ 22.320737] Read of size 8 at addr ffffffff82aadae0 by task sockex3/1946 [ 22.321646] [ 22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: G W 4.12.0-rc6-01680-g2ee87db3a287 #22 [ 22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 22.324260] Call Trace: [ 22.324612] dump_stack+0x67/0x99 [ 22.325081] print_address_description+0x1e8/0x290 [ 22.325734] ? bpf_prog_select_runtime+0x48a/0x670 [ 22.326360] kasan_report+0x265/0x350 [ 22.326860] __asan_report_load8_noabort+0x19/0x20 [ 22.327484] bpf_prog_select_runtime+0x48a/0x670 [ 22.328109] bpf_prog_load+0x626/0xd40 [ 22.328637] ? __bpf_prog_charge+0xc0/0xc0 [ 22.329222] ? check_nnp_nosuid.isra.61+0x100/0x100 [ 22.329890] ? __might_fault+0xf6/0x1b0 [ 22.330446] ? lock_acquire+0x360/0x360 [ 22.331013] SyS_bpf+0x67c/0x24d0 [ 22.331491] ? trace_hardirqs_on+0xd/0x10 [ 22.332049] ? __getnstimeofday64+0xaf/0x1c0 [ 22.332635] ? bpf_prog_get+0x20/0x20 [ 22.333135] ? __audit_syscall_entry+0x300/0x600 [ 22.333770] ? syscall_trace_enter+0x540/0xdd0 [ 22.334339] ? exit_to_usermode_loop+0xe0/0xe0 [ 22.334950] ? do_syscall_64+0x48/0x410 [ 22.335446] ? bpf_prog_get+0x20/0x20 [ 22.335954] do_syscall_64+0x181/0x410 [ 22.336454] entry_SYSCALL64_slow_path+0x25/0x25 [ 22.337121] RIP: 0033:0x7f263fe81f19 [ 22.337618] RSP: 002b:00007ffd9a3440c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 22.338619] RAX: ffffffffffffffda RBX: 0000000000aac5fb RCX: 00007f263fe81f19 [ 22.339600] RDX: 0000000000000030 RSI: 00007ffd9a3440d0 RDI: 0000000000000005 [ 22.340470] RBP: 0000000000a9a1e0 R08: 0000000000a9a1e0 R09: 0000009d00000001 [ 22.341430] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000010000 [ 22.342411] R13: 0000000000a9a023 R14: 0000000000000001 R15: 0000000000000003 [ 22.343369] [ 22.343593] The buggy address belongs to the variable: [ 22.344241] interpreters+0x80/0x980 [ 22.344708] [ 22.344908] Memory state around the buggy address: [ 22.345556] ffffffff82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa fa [ 22.346449] ffffffff82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 00 [ 22.347361] >ffffffff82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa [ 22.348301] ^ [ 22.349142] ffffffff82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00 [ 22.350058] ffffffff82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa fa [ 22.350984] ================================================================== Fixes: b870aa901f4b ("bpf: use different interpreter depending on required stack size") Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 774069ca18a7..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.2.3 From 6bdf6abc56b53103324dfd270a86580306e1a232 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Jun 2017 03:04:59 +0200 Subject: bpf: prevent leaking pointer via xadd on unpriviledged Leaking kernel addresses on unpriviledged is generally disallowed, for example, verifier rejects the following: 0: (b7) r0 = 0 1: (18) r2 = 0xffff897e82304400 3: (7b) *(u64 *)(r1 +48) = r2 R2 leaks addr into ctx Doing pointer arithmetic on them is also forbidden, so that they don't turn into unknown value and then get leaked out. However, there's xadd as a special case, where we don't check the src reg for being a pointer register, e.g. the following will pass: 0: (b7) r0 = 0 1: (7b) *(u64 *)(r1 +48) = r0 2: (18) r2 = 0xffff897e82304400 ; map 4: (db) lock *(u64 *)(r1 +48) += r2 5: (95) exit We could store the pointer into skb->cb, loose the type context, and then read it out from there again to leak it eventually out of a map value. Or more easily in a different variant, too: 0: (bf) r6 = r1 1: (7a) *(u64 *)(r10 -8) = 0 2: (bf) r2 = r10 3: (07) r2 += -8 4: (18) r1 = 0x0 6: (85) call bpf_map_lookup_elem#1 7: (15) if r0 == 0x0 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R6=ctx R10=fp 8: (b7) r3 = 0 9: (7b) *(u64 *)(r0 +0) = r3 10: (db) lock *(u64 *)(r0 +0) += r6 11: (b7) r0 = 0 12: (95) exit from 7 to 11: R0=inv,min_value=0,max_value=0 R6=ctx R10=fp 11: (b7) r0 = 0 12: (95) exit Prevent this by checking xadd src reg for pointer types. Also add a couple of test cases related to this. Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 339c8a1371de..a8a725697bed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -989,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); -- cgit v1.2.3 From 59494fe2c89e46e85d83de9bc45dd1d528955c49 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 16:58:40 +0530 Subject: PM: hibernate: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 6332 488 308 7128 1bd8 kernel/power/hibernate.o File size After adding 'const': text data bss dec hex filename 6396 424 308 7128 1bd8 kernel/power/hibernate.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a8b978c35a6a..e1914c7b85b1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1108,7 +1108,7 @@ static struct attribute * g[] = { }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.2.3 From 9e52b32567126fe146f198971364f68d3bc5233f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 22 Jun 2017 11:24:42 +0200 Subject: tracing/kprobes: Allow to create probe with a module name starting with a digit Always try to parse an address, since kstrtoul() will safely fail when given a symbol as input. If that fails (which will be the case for a symbol), try to parse a symbol instead. This allows creating a probe such as: p:probe/vlan_gro_receive 8021q:vlan_gro_receive+0 Which is necessary for this command to work: perf probe -m 8021q -a vlan_gro_receive Link: http://lkml.kernel.org/r/fd72d666f45b114e2c5b9cf7e27b91de1ec966f1.1498122881.git.sd@queasysnail.net Cc: stable@vger.kernel.org Fixes: 413d37d1e ("tracing: Add kprobe-based event tracer") Acked-by: Masami Hiramatsu Signed-off-by: Sabrina Dubroca Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..b53c8d369163 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -707,20 +707,16 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - /* an address specified */ - ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { + + /* try to parse an address. if that fails, try to read the + * input as a symbol. */ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { - pr_info("Failed to parse symbol.\n"); + pr_info("Failed to parse either an address or a symbol.\n"); return ret; } if (offset && is_return && -- cgit v1.2.3 From a9bd8dfa539493db265e46a496c1a89279ab31d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:39:01 -0400 Subject: kimage_file_prepare_segments(): don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/kexec_file.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..766e7e4d3ad9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -162,16 +162,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; + image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); + if (IS_ERR(image->cmdline_buf)) { + ret = PTR_ERR(image->cmdline_buf); + image->cmdline_buf = NULL; goto out; } -- cgit v1.2.3 From e4448ed87ccdbacb74871736f63220642242b32f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:43:00 -0400 Subject: bpf: don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/bpf/syscall.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..4b8b10bddfde 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,14 +322,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -402,14 +399,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -488,14 +482,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } preempt_disable(); __this_cpu_inc(bpf_prog_active); @@ -507,7 +498,6 @@ static int map_delete_elem(union bpf_attr *attr) if (!err) trace_bpf_map_delete_elem(map, ufd, key); -free_key: kfree(key); err_put: fdput(f); @@ -536,14 +526,11 @@ static int map_get_next_key(union bpf_attr *attr) return PTR_ERR(map); if (ukey) { - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } } else { key = NULL; } -- cgit v1.2.3 From 48365b38849fdb1ee6dc65beac044ca59f669683 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 26 Jun 2017 17:07:14 +0200 Subject: sched/debug: Expose the number of RT/DL tasks that can migrate Add the value of the rt_rq.rt_nr_migratory and dl_rq.dl_nr_migratory to the sched_debug output, for instance: rt_rq[0]: .rt_nr_running : 2 .rt_nr_migratory : 1 <--- Like this .rt_throttled : 0 .rt_time : 828.645877 .rt_runtime : 1000.000000 This is useful to debug problems related to the RT/DL schedulers. This also fixes the format of some variables, that were unsigned, rather than signed. Signed-off-by: Daniel Bristot de Oliveira Cc: Clark Williams Cc: Linus Torvalds Cc: Luis Claudio R. Goncalves Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: linux-rt-users Link: http://lkml.kernel.org/r/7896f71cada54ee7dd8507bb666063a2e051c3d4.1498482127.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 38f019324f1a..4fa66de52bd6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x)) #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) - P(rt_nr_running); + PU(rt_nr_running); +#ifdef CONFIG_SMP + PU(rt_nr_migratory); +#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); #undef PN +#undef PU #undef P } @@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) struct dl_bw *dl_bw; SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); - SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); + +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) + + PU(dl_nr_running); #ifdef CONFIG_SMP + PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; #endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); + +#undef PU } extern __read_mostly int sched_clock_running; -- cgit v1.2.3 From 993647a293814dd47ae41d38657fda6e4ab04e33 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 17:40:47 +0530 Subject: cpu/hotplug: Constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const: File size before: text data bss dec hex filename 12582 15361 20 27963 6d3b kernel/cpu.o File size After adding 'const': text data bss dec hex filename 12710 15265 20 27995 6d5b kernel/cpu.o Signed-off-by: Arvind Yadav Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: anna-maria@linutronix.de Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: rcochran@linutronix.de Link: http://lkml.kernel.org/r/f9079e94e12b36d245e7adbf67d312bc5d0250c6.1498737970.git.arvind.yadav.cs@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d0f5f54aa087..b69c0588f8c9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1629,7 +1629,7 @@ static struct attribute *cpuhp_cpu_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_attr_group = { +static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", NULL @@ -1661,7 +1661,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_root_attr_group = { +static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", NULL -- cgit v1.2.3 From 72298e5c92c50edd8cb7cfda4519483ce65fa166 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 29 Jun 2017 13:41:28 -0500 Subject: sched/cputime: Refactor the cputime_adjust() code Address a Coverity false positive, which is caused by overly convoluted code: Value assigned to variable 'utime' at line 619:utime = rtime; is overwritten at line 642:utime = rtime - stime; before it can be used. This makes such variable assignment useless. Remove this variable assignment and refactor the code related. Addresses-Coverity-ID: 1371643 Signed-off-by: Gustavo A. R. Silva Cc: Frans Klaver Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/20170629184128.GA5271@embeddedgus Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aea3135c5d90..67c70e287647 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,19 +615,13 @@ static void cputime_adjust(struct task_cputime *curr, * userspace. Once a task gets some ticks, the monotonicy code at * 'update' will ensure things converge to the observed ratio. */ - if (stime == 0) { - utime = rtime; - goto update; + if (stime != 0) { + if (utime == 0) + stime = rtime; + else + stime = scale_stime(stime, rtime, stime + utime); } - if (utime == 0) { - stime = rtime; - goto update; - } - - stime = scale_stime(stime, rtime, stime + utime); - -update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. -- cgit v1.2.3 From 5c4994102fb508d4a0f7a8afa46560c314c1ebd4 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:05 -0700 Subject: posix-timers: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_gettime, clock_settime, clock_getres and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 83 +++++++++++++++++++++++++--------------------- kernel/time/posix-timers.c | 63 ++++++++++++++--------------------- 2 files changed, 70 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 65878221cbfb..06f34feb635e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -51,40 +51,52 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) +int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; + case CLOCK_REALTIME: + ktime_get_real_ts64(tp); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(tp); + break; + case CLOCK_BOOTTIME: + get_monotonic_boottime64(tp); + break; + default: + return -EINVAL; } - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return 0; +} +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -93,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + if (put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -142,41 +154,35 @@ COMPAT_SYS_NI(setitimer); COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct compat_timespec __user *,tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + int ret; + struct timespec64 kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; - } + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (compat_put_timespec(&kernel_tp, tp)) + if (compat_put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -185,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec(&rtn_tp, tp)) + if (compat_put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: return -EINVAL; } } + COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 82d67be7d9d1..39322ae5dd87 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1049,34 +1049,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + struct timespec64 kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + error = kc->clock_get(which_clock, &kernel_tp); - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; @@ -1109,17 +1105,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; + struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; @@ -1131,38 +1125,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&ts, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + err = kc->clock_get(which_clock, &ts); - if (!error && compat_put_timespec(&kernel_tp, tp)) - error = -EFAULT; + if (!err && compat_put_timespec64(&ts, tp)) + err = -EFAULT; - return error; + return err; } COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, @@ -1193,21 +1182,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); - - if (!error && tp && compat_put_timespec(&rtn_tp, tp)) - error = -EFAULT; + err = kc->clock_getres(which_clock, &ts); + if (!err && tp && compat_put_timespec64(&ts, tp)) + return -EFAULT; - return error; + return err; } + #endif /* -- cgit v1.2.3 From c0edd7c9acd0eaee149ab6cb4441cc71a1af87f0 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:06 -0700 Subject: nanosleep: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_nanosleep and nanosleep and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/alarmtimer.c | 4 ++-- kernel/time/hrtimer.c | 30 +++++++++++++----------------- kernel/time/posix-cpu-timers.c | 8 ++------ kernel/time/posix-timers.c | 20 ++++++++------------ 4 files changed, 25 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c991cf212c6d..0b8ff7d257ea 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarmtimer_freezerset(absexp, type); restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { - struct timespec rmt; + struct timespec64 rmt; ktime_t rem; rem = ktime_sub(absexp, alarm_bases[type].gettime()); if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 81da124f1115..88f75f92ef36 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); -int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT case TT_COMPAT: - if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: - if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: @@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); - struct timespec rmt; + struct timespec64 rmt; if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } @@ -1546,19 +1546,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (copy_from_user(&tu, rqtp, sizeof(tu))) + if (get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #ifdef CONFIG_COMPAT @@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (compat_get_timespec(&tu, rqtp)) + if (compat_get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9df618ee64cf..7323da5950cc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1314,12 +1314,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, */ restart = ¤t->restart_block; restart->nanosleep.expires = expires; - if (restart->nanosleep.type != TT_NONE) { - struct timespec ts; - - ts = timespec64_to_timespec(it.it_value); - error = nanosleep_copyout(restart, &ts); - } + if (restart->nanosleep.type != TT_NONE) + error = nanosleep_copyout(restart, &it.it_value); } return error; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 39322ae5dd87..4b0fc3b0a1c4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1213,26 +1213,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT @@ -1241,26 +1239,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #endif -- cgit v1.2.3 From 725816e8aabb1c183baa2bc9572ab9a0d26b9ea1 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:08 -0700 Subject: posix_clocks: Use get_itimerspec64() and put_itimerspec64() Usage of these apis and their compat versions makes the syscalls: timer_settime and timer_gettime and their compat implementations simpler. This patch also serves as a preparatory patch for changing syscalls to use new time_t data types to support the y2038 effort by isolating the processing of user pointers through these apis. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-timers.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b0fc3b0a1c4..13d6881f908b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (put_compat_itimerspec(setting, &cur_setting)) + if (put_compat_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; int error = 0; if (!new_setting) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; @@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct compat_itimerspec __user *, new, struct compat_itimerspec __user *, old) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old ? &old_spec : NULL; int error = 0; if (!new) return -EINVAL; - if (get_compat_itimerspec(&new_spec, new)) + if (get_compat_itimerspec64(&new_spec, new)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (put_compat_itimerspec(old, &old_spec)) + if (put_compat_itimerspec64(&old_spec, old)) error = -EFAULT; } return error; -- cgit v1.2.3 From c207aee48037abca71c669cbec407b9891965c34 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 28 Jun 2017 10:11:06 -0500 Subject: objtool, x86: Add several functions and files to the objtool whitelist In preparation for an objtool rewrite which will have broader checks, whitelist functions and files which cause problems because they do unusual things with the stack. These whitelists serve as a TODO list for which functions and files don't yet have undwarf unwinder coverage. Eventually most of the whitelists can be removed in favor of manual CFI hint annotations or objtool improvements. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Jiri Slaby Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/7f934a5d707a574bda33ea282e9478e627fb1829.1498659915.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/kexec_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ae1a3ba24df5..154ffb489b93 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -874,7 +875,7 @@ int kexec_load_disabled; * only when panic_cpu holds the current CPU number; this is the only CPU * which processes crash_kexec routines. */ -void __crash_kexec(struct pt_regs *regs) +void __noclone __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs) mutex_unlock(&kexec_mutex); } } +STACK_FRAME_NON_STANDARD(__crash_kexec); void crash_kexec(struct pt_regs *regs) { -- cgit v1.2.3 From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain functions pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch. Signed-off-by: Kees Cook --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..5616511abf39 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ -- cgit v1.2.3 From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 30 Jun 2017 20:02:40 -0700 Subject: bpf: BPF support for sock_ops Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. The program will be called at appropriate times to set relevant connections parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Alghough there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it oes not require application changes and it can be updated easily at any time. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called only once during the connections's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has "op" field to specify the type of operation requested. The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type, following patches add the functionality to set various connection parameters. This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS and a new bpf syscall command to load a new program of this type: BPF_PROG_LOAD_SOCKET_OPS. Two new corresponding structs (one for the kernel one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version * Some fields are in network byte order reflecting the sock struct * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to * convert them to host byte order. */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; /* In network byte order */ __u32 local_ip4; /* In network byte order */ __u32 remote_ip6[4]; /* In network byte order */ __u32 local_ip6[4]; /* In network byte order */ __u32 remote_port; /* In network byte order */ __u32 local_port; /* In host byte horder */ }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function, and they ignore the return value. The reply fields of the bpf_sockt_ops struct are there in case a bpf program needs to return a value larger than an integer. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + +/** + * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock + * @sk: socket to get cgroup from + * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains + * sk with connection information (IP addresses, etc.) May not contain + * cgroup info if it is a req sock. + * @type: The type of program to be exectuted + * + * socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock_ops + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4409ccca8831..d4d47de75bba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_SOCK_OPS: + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; default: return -EINVAL; } @@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); @@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } + #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration -- cgit v1.2.3 From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:27 +0200 Subject: bpf: simplify narrower ctx access This work tries to make the semantics and code around the narrower ctx access a bit easier to follow. Right now everything is done inside the .is_valid_access(). Offset matching is done differently for read/write types, meaning writes don't support narrower access and thus matching only on offsetof(struct foo, bar) is enough whereas for read case that supports narrower access we must check for offsetof(struct foo, bar) + offsetof(struct foo, bar) + sizeof() - 1 for each of the cases. For read cases of individual members that don't support narrower access (like packet pointers or skb->cb[] case which has its own narrow access logic), we check as usual only offsetof(struct foo, bar) like in write case. Then, for the case where narrower access is allowed, we also need to set the aux info for the access. Meaning, ctx_field_size and converted_op_size have to be set. First is the original field size e.g. sizeof() as in above example from the user facing ctx, and latter one is the target size after actual rewrite happened, thus for the kernel facing ctx. Also here we need the range match and we need to keep track changing convert_ctx_access() and converted_op_size from is_valid_access() as both are not at the same location. We can simplify the code a bit: check_ctx_access() becomes simpler in that we only store ctx_field_size as a meta data and later in convert_ctx_accesses() we fetch the target_size right from the location where we do convert. Should the verifier be misconfigured we do reject for BPF_WRITE cases or target_size that are not provided. For the subsystems, we always work on ranges in is_valid_access() and add small helpers for ranges and narrow access, convert_ctx_accesses() sets target_size for the relevant instruction. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Yonghong Song Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 78 +++++++++++++++++++++--------------------------- kernel/trace/bpf_trace.c | 31 ++++++++----------- 2 files changed, 46 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ea2adcb233b..6f820a044079 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return 0; } -static int bpf_size_to_bytes(int bpf_size) -{ - if (bpf_size == BPF_W) - return 4; - else if (bpf_size == BPF_H) - return 2; - else if (bpf_size == BPF_B) - return 1; - else if (bpf_size == BPF_DW) - return 8; - else - return -EINVAL; -} - static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -761,7 +747,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - struct bpf_insn_access_aux info = { .reg_type = *reg_type }; + struct bpf_insn_access_aux info = { + .reg_type = *reg_type, + }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) @@ -769,25 +757,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, &info)) { - /* a non zero info.ctx_field_size indicates: - * . For this field, the prog type specific ctx conversion algorithm - * only supports whole field access. - * . This ctx access is a candiate for later verifier transformation - * to load the whole field and then apply a mask to get correct result. - * a non zero info.converted_op_size indicates perceived actual converted - * value width in convert_ctx_access. + /* A non zero info.ctx_field_size indicates that this field is a + * candidate for later verifier transformation to load the whole + * field and then apply a mask when accessed with a narrower + * access than actual ctx access size. A zero info.ctx_field_size + * will only allow for whole field access and rejects any other + * type of narrower access. */ - if ((info.ctx_field_size && !info.converted_op_size) || - (!info.ctx_field_size && info.converted_op_size)) { - verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", - env->prog->type, off, size); - return -EACCES; - } - - if (info.ctx_field_size) { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; - } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ @@ -3401,11 +3378,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; + int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; + bool is_narrower_load; + u32 target_size; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3445,39 +3424,50 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - off = insn->off; - size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - converted_op_size = env->insn_aux_data[i + delta].converted_op_size; - is_narrower_load = type == BPF_READ && size < ctx_field_size; + size = BPF_LDST_BYTES(insn); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific * convert_ctx_access changes. If conversion is successful, * we will apply proper mask to the result. */ + is_narrower_load = size < ctx_field_size; if (is_narrower_load) { - int size_code = BPF_H; + u32 off = insn->off; + u8 size_code; + + if (type == BPF_WRITE) { + verbose("bpf verifier narrow ctx access misconfigured\n"); + return -EINVAL; + } + size_code = BPF_H; if (ctx_field_size == 4) size_code = BPF_W; else if (ctx_field_size == 8) size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + + target_size = 0; + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + (ctx_field_size && !target_size)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load && size < converted_op_size) { + + if (is_narrower_load && size < target_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); else insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 97c46b440cd6..5c6d538dbf43 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,7 +583,8 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - int sample_period_off; + const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, + sample_period); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -592,43 +593,35 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type if (off % size != 0) return false; - /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ - sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); - if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - int allowed; - -#ifdef __LITTLE_ENDIAN - allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; -#else - allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; -#endif - if (!allowed) + switch (off) { + case bpf_ctx_range(struct bpf_perf_event_data, sample_period): + bpf_ctx_record_field_size(info, size_sp); + if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) return false; - info->ctx_field_size = 8; - info->converted_op_size = 8; - } else { + break; + default: if (size != sizeof(long)) return false; } + return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): - BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, - offsetof(struct perf_sample_data, period)); + bpf_target_off(struct perf_sample_data, period, 8, + target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, -- cgit v1.2.3 From 9780c0ab1a4e64ef6998c4d83f9df5be806a02dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:28 +0200 Subject: bpf: export whether tail call has jited owner We do export through fdinfo already whether a prog is JITed or not, given a program load can fail in case of either prog or tail call map has JITed property, but neither both are JITed or not JITed, we can facilitate error reporting in loaders like iproute2 through exporting owner_jited of tail call map. We already do export owner_prog_type through this facility, so parser can pick up both for comparison. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d4d47de75bba..18980472f5b0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -216,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_map *map = filp->private_data; const struct bpf_array *array; u32 owner_prog_type = 0; + u32 owner_jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); owner_prog_type = array->owner_prog_type; + owner_jited = array->owner_jited; } seq_printf(m, @@ -236,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->map_flags, map->pages * 1ULL << PAGE_SHIFT); - if (owner_prog_type) + if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", owner_prog_type); + seq_printf(m, "owner_jited:\t%u\n", + owner_jited); + } } #endif -- cgit v1.2.3 From 7bda4b40c5624c3f1c69227f8ebfd46a4b83f2ef Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:29 +0200 Subject: bpf: extend bpf_trace_printk to support %i Currently, bpf_trace_printk does not support common formatting symbol '%i' however vsprintf does and is what eventually gets called by bpf helper. If users are used to '%i' and currently make use of it, then bpf_trace_printk will just return with error without dumping anything to the trace pipe, so just add support for '%i' to the helper. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5c6d538dbf43..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) } /* - * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed + * Only limited trace_printk() conversion specifiers allowed: + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } - if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; fmt_cnt++; } -- cgit v1.2.3 From 43188702b3d98d2792969a3377a30957f05695e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:30 +0200 Subject: bpf, verifier: add additional patterns to evaluate_reg_imm_alu Currently the verifier does not track imm across alu operations when the source register is of unknown type. This adds additional pattern matching to catch this and track imm. We've seen LLVM generating this pattern while working on cilium. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f820a044079..6a86723c5b64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1657,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); + + /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ + if (src_reg->imm > 0 && dst_reg->imm) { + switch (opcode) { + case BPF_ADD: + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + dst_reg->imm--; + break; + case BPF_AND: + /* dreg &= sreg + * AND can not extend zero bits only shrink + * Ex. 0x00..00ffffff + * & 0x0f..ffffffff + * ---------------- + * 0x00..00ffffff + */ + dst_reg->imm = max(src_reg->imm, 63 - imm_log2); + break; + case BPF_OR: + /* dreg |= sreg + * OR can only extend zero bits + * Ex. 0x00..00ffffff + * | 0x0f..ffffffff + * ---------------- + * 0x0f..00ffffff + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + break; + case BPF_SUB: + case BPF_MUL: + case BPF_RSH: + case BPF_LSH: + /* These may be flushed out later */ + default: + mark_reg_unknown_value(regs, insn->dst_reg); + } + } else { + mark_reg_unknown_value(regs, insn->dst_reg); + } + + dst_reg->type = UNKNOWN_VALUE; + return 0; +} + static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1666,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u64 dst_imm = dst_reg->imm; + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) + return evaluate_reg_imm_alu_unknown(env, insn); + /* dst_reg->type == CONST_IMM here. Simulate execution of insns * containing ALU ops. Don't care about overflow or negative * values, just add/sub/... them; registers are in u64. -- cgit v1.2.3 From 3b9c08ae3dd44201b3a188aef34d6ddf73434015 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Jul 2017 11:53:40 +0200 Subject: Revert "sched/cputime: Refactor the cputime_adjust() code" This reverts commit 72298e5c92c50edd8cb7cfda4519483ce65fa166. As Peter explains: > Argh, no... That code was perfectly fine. The new code otoh is > convoluted. > > The old code had the following form: > > if (exception1) > deal with exception1 > > if (execption2) > deal with exception2 > > do normal stuff > > Which is as simple and straight forward as it gets. > > The new code otoh reads like: > > if (!exception1) { > if (exception2) > deal with exception 2 > else > do normal stuff > } So restore the old form. Also fix the comment describing the logic, as it was confusing. Requested-by: Peter Zijlstra Cc: Gustavo A. R. Silva Cc: Frans Klaver Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Wanpeng Li Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 67c70e287647..84a419bdf5aa 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; /* - * If either stime or both stime and utime are 0, assume all runtime is - * userspace. Once a task gets some ticks, the monotonicy code at - * 'update' will ensure things converge to the observed ratio. + * If either stime or utime are 0, assume all runtime is userspace. + * Once a task gets some ticks, the monotonicy code at 'update:' + * will ensure things converge to the observed ratio. */ - if (stime != 0) { - if (utime == 0) - stime = rtime; - else - stime = scale_stime(stime, rtime, stime + utime); + if (stime == 0) { + utime = rtime; + goto update; } + if (utime == 0) { + stime = rtime; + goto update; + } + + stime = scale_stime(stime, rtime, stime + utime); + +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. -- cgit v1.2.3 From e5682b4eecb2b73282853d0ef314d3164b986997 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 4 Jul 2017 11:25:15 +0200 Subject: genirq/debugfs: Fix build for !CONFIG_IRQ_DOMAIN Fix this build error: kernel/irq/internals.h:440:20: error: inlining failed in call to always_inline 'irq_domain_debugfs_init': function body not available kernel/irq/debugfs.c:202:2: note: called from here irq_domain_debugfs_init(root_dir); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Sebastian Ott Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1707041124000.1712@schleppi --- kernel/irq/internals.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9da14d125df4..dbfba9933ed2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -437,7 +437,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -static inline void irq_domain_debugfs_init(struct dentry *root); +static inline void irq_domain_debugfs_init(struct dentry *root) +{ +} # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) -- cgit v1.2.3 From 2372a519f63829b8effcdde5f4564a7e036294f0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Jul 2017 12:06:01 +0200 Subject: genirq: Force inlining of __irq_startup_managed to prevent build failure If CONFIG_SMP=n, and gcc (e.g. 4.1.2) decides not to inline __irq_startup_managed(), the build fails with: kernel/built-in.o: In function `irq_startup': (.text+0x38ed8): undefined reference to `irq_set_affinity_locked' Fix this by forcing inlining of __irq_startup_managed(). Fixes: 761ea388e8c4e3ac ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1499162761-12398-1-git-send-email-geert@linux-m68k.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2e30d925a40d..aa5497dfb29e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -234,7 +234,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) return IRQ_STARTUP_MANAGED; } #else -static int +static __always_inline int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; -- cgit v1.2.3 From 3a90795e1e885167209056a1a90be965add30e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:36 +0200 Subject: genirq: Move bus locking into __setup_irq() There is no point in having the irq_bus_lock() protection around all callers to __setup_irq(). Move it into __setup_irq(). This is also a preparatory patch for addressing the issues with the irq resource callbacks. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214343.960949031@linutronix.de --- kernel/irq/manage.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5c11c1730ba5..0934e02fa04e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + chip_bus_lock(desc); + /* * The following block of code has to be executed atomically */ @@ -1347,6 +1349,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); irq_setup_timings(desc, new); @@ -1378,6 +1381,8 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1417,9 +1422,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1674,9 +1677,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); @@ -1924,9 +1925,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1980,9 +1979,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); -- cgit v1.2.3 From 9114014cf4e6df0b22d764380ae1fc54f1a7a8b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:37 +0200 Subject: genirq: Add mutex to irq desc to serialize request/free_irq() The irq_request/release_resources() callbacks ar currently invoked under desc->lock with interrupts disabled. This is a source of problems on RT and conceptually not required. Add a seperate mutex to struct irq_desc which allows to serialize request/free_irq(), which can be used to move the resource functions out of the desc->lock held region. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.039220922@linutronix.de --- kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 948b50e78549..906a67e58391 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, affinity, owner); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0934e02fa04e..0139908b8d53 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + mutex_lock(&desc->request_mutex); + chip_bus_lock(desc); /* @@ -1350,6 +1352,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); @@ -1383,6 +1386,8 @@ out_unlock: chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1446,6 +1451,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); @@ -1521,6 +1527,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + mutex_unlock(&desc->request_mutex); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); -- cgit v1.2.3 From 46e48e257360f0845fe17089713cbad4db611e70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:38 +0200 Subject: genirq: Move irq resource handling out of spinlocked region Aside of being conceptually wrong, there is also an actual (hard to trigger and mostly theoretical) problem. CPU0 CPU1 free_irq(X) interrupt X spin_lock(desc->lock) wake irq thread() spin_unlock(desc->lock) spin_lock(desc->lock) remove action() shutdown_irq() release_resources() thread_handler() spin_unlock(desc->lock) access released resources. synchronize_irq() Move the release resources invocation after synchronize_irq() so it's guaranteed that the threaded handler has finished. Move the resource request call out of the desc->lock held region as well, so the invocation context is the same for both request and release. This solves the problems with those functions on RT as well. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.117028181@linutronix.de --- kernel/irq/manage.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0139908b8d53..3e693430bfe1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1168,6 +1168,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags &= ~IRQF_ONESHOT; mutex_lock(&desc->request_mutex); + if (!desc->action) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mutex; + } + } chip_bus_lock(desc); @@ -1271,13 +1279,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - ret = irq_request_resources(desc); - if (ret) { - pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", - new->name, irq, desc->irq_data.chip->name); - goto out_unlock; - } - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1386,6 +1387,10 @@ out_unlock: chip_bus_sync_unlock(desc); + if (!desc->action) + irq_release_resources(desc); + +out_mutex: mutex_unlock(&desc->request_mutex); out_thread: @@ -1484,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_release_resources(desc); irq_remove_timings(desc); } @@ -1527,6 +1531,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + if (!desc->action) + irq_release_resources(desc); + mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); -- cgit v1.2.3 From 2343877fbda701599653e63f8dcc318aa1bf15ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:39 +0200 Subject: genirq/timings: Move free timings out of spinlocked region No point to do memory management from a interrupt disabled spin locked region. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Daniel Lezcano Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.196130646@linutronix.de --- kernel/irq/manage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3e693430bfe1..91e1f2390752 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1489,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_remove_timings(desc); } #ifdef CONFIG_SMP @@ -1531,8 +1530,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } - if (!desc->action) + if (!desc->action) { irq_release_resources(desc); + irq_remove_timings(desc); + } mutex_unlock(&desc->request_mutex); -- cgit v1.2.3 From 1d0c6e593023ac5dafc2ea2b3f23d96f1c1f2fa2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 30 Jun 2017 10:22:14 +0530 Subject: PM / sleep: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 3802 624 32 4458 116a kernel/power/main.o File size After adding 'const': text data bss dec hex filename 3866 560 32 4458 116a kernel/power/main.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -705,7 +705,7 @@ static struct attribute * g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.2.3 From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:07 +0200 Subject: vtime, sched/cputime: Remove vtime_account_user() It's an unnecessary function between vtime_user_exit() and account_user_time(). Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 84a419bdf5aa..5adc896d0f64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -724,21 +724,21 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_account_user(struct task_struct *tsk) +void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); + __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_user_enter(struct task_struct *tsk) +void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); + tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; + account_user_time(tsk, get_vtime_delta(tsk)); write_seqcount_end(&tsk->vtime_seqcount); } -- cgit v1.2.3 From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:08 +0200 Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime Even though it doesn't have functional consequences, setting the task's new context state after we actually accounted the pending vtime from the old context state makes more sense from a review perspective. vtime_user_exit() is the only function that doesn't follow that rule and that can bug the reviewer for a little while until he realizes there is no reason for this special case. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5adc896d0f64..ab68927e8e94 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct *tsk) void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) account_user_time(tsk, get_vtime_delta(tsk)); + tsk->vtime_snap_whence = VTIME_SYS; write_seqcount_end(&tsk->vtime_seqcount); } -- cgit v1.2.3 From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:09 +0200 Subject: sched/cputime: Rename vtime fields The current "snapshot" based naming on vtime fields suggests we record some past event but that's a low level picture of their actual purpose which comes out blurry. The real point of these fields is to run a basic state machine that tracks down cputime entry while switching between contexts. So lets reflect that with more meaningful names. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++-- kernel/sched/cputime.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..83c4f9bf3e14 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1638,8 +1638,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime_seqcount); - p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_INACTIVE; + p->vtime_starttime = 0; + p->vtime_state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ab68927e8e94..8c64753067c5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_snap)) + if (time_before(now, (unsigned long)tsk->vtime_starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_snap); + return jiffies_to_nsecs(now - tsk->vtime_starttime); } static u64 get_vtime_delta(struct task_struct *tsk) @@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_snap); + delta = jiffies_to_nsecs(now - tsk->vtime_starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap = now; + WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); + tsk->vtime_starttime = now; return delta - other; } @@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struct *tsk) { /* * The flags must be updated under the lock with - * the vtime_snap flush and update. + * the vtime_starttime flush and update. * That enforces a right ordering and update sequence * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. @@ -776,12 +776,12 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_snap_whence = VTIME_INACTIVE; + prev->vtime_state = VTIME_INACTIVE; write_seqcount_end(&prev->vtime_seqcount); write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = jiffies; + current->vtime_state = VTIME_SYS; + current->vtime_starttime = jiffies; write_seqcount_end(¤t->vtime_seqcount); } @@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); - t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = jiffies; + t->vtime_state = VTIME_SYS; + t->vtime_starttime = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } @@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) + if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); @@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) + if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_snap_whence == VTIME_SYS) + else if (t->vtime_state == VTIME_SYS) *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } -- cgit v1.2.3 From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:10 +0200 Subject: sched/cputime: Move the vtime task fields to their own struct We are about to add vtime accumulation fields to the task struct. Let's avoid more bloatification and gather vtime information to their own struct. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 6 +-- kernel/sched/cputime.c | 112 ++++++++++++++++++++++++++++--------------------- 2 files changed, 67 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 83c4f9bf3e14..d927ec11aa7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process( prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_init(&p->vtime_seqcount); - p->vtime_starttime = 0; - p->vtime_state = VTIME_INACTIVE; + seqcount_init(&p->vtime.seqcount); + p->vtime.starttime = 0; + p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8c64753067c5..9ee725edcbe0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static u64 vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_starttime)) + if (time_before(now, (unsigned long)vtime->starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_starttime); + return jiffies_to_nsecs(now - vtime->starttime); } -static u64 get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); u64 delta, other; @@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_starttime); + delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); - tsk->vtime_starttime = now; + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); + vtime->starttime = now; return delta - other; } static void __vtime_account_system(struct task_struct *tsk) { - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); + account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); } void vtime_account_system(struct task_struct *tsk) { - if (!vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + if (!vtime_delta(vtime)) return; - write_seqcount_begin(&tsk->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); + vtime->state = VTIME_USER; + write_seqcount_end(&vtime->seqcount); } void vtime_user_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); - tsk->vtime_snap_whence = VTIME_SYS; - write_seqcount_end(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) + account_user_time(tsk, get_vtime_delta(vtime)); + vtime->state = VTIME_SYS; + write_seqcount_end(&vtime->seqcount); } void vtime_guest_enter(struct task_struct *tsk) { + struct vtime *vtime = &tsk->vtime; /* * The flags must be updated under the lock with * the vtime_starttime flush and update. @@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(get_vtime_delta(tsk)); + account_idle_time(get_vtime_delta(&tsk->vtime)); } void arch_vtime_task_switch(struct task_struct *prev) { - write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_state = VTIME_INACTIVE; - write_seqcount_end(&prev->vtime_seqcount); + struct vtime *vtime = &prev->vtime; - write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_state = VTIME_SYS; - current->vtime_starttime = jiffies; - write_seqcount_end(¤t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_INACTIVE; + write_seqcount_end(&vtime->seqcount); + + vtime = ¤t->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { + struct vtime *vtime = &t->vtime; unsigned long flags; local_irq_save(flags); - write_seqcount_begin(&t->vtime_seqcount); - t->vtime_state = VTIME_SYS; - t->vtime_starttime = jiffies; - write_seqcount_end(&t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } u64 task_gtime(struct task_struct *t) { + struct vtime *vtime = &t->vtime; unsigned int seq; u64 gtime; @@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t) return t->gtime; do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + gtime += vtime_delta(vtime); - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); return gtime; } @@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t) */ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - u64 delta; + struct vtime *vtime = &t->vtime; unsigned int seq; + u64 delta; if (!vtime_accounting_enabled()) { *utime = t->utime; @@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) + if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); + delta = vtime_delta(vtime); /* * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_state == VTIME_SYS) + else if (vtime->state == VTIME_SYS) *stime += delta; - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 29 Jun 2017 19:15:11 +0200 Subject: sched/cputime: Accumulate vtime on top of nsec clocksource Currently the cputime source used by vtime is jiffies. When we cross a context boundary and jiffies have changed since the last snapshot, the pending cputime is accounted to the switching out context. This system works ok if the ticks are not aligned across CPUs. If they instead are aligned (ie: all fire at the same time) and the CPUs run in userspace, the jiffies change is only observed on tick exit and therefore the user cputime is accounted as system cputime. This is because the CPU that maintains timekeeping fires its tick at the same time as the others. It updates jiffies in the middle of the tick and the other CPUs see that update on IRQ exit: CPU 0 (timekeeper) CPU 1 ------------------- ------------- jiffies = N ... run in userspace for a jiffy tick entry tick entry (sees jiffies = N) set jiffies = N + 1 tick exit tick exit (sees jiffies = N + 1) account 1 jiffy as stime Fix this with using a nanosec clock source instead of jiffies. The cputime is then accumulated and flushed everytime the pending delta reaches a jiffy in order to mitigate the accounting overhead. [ fweisbec: changelog, rebase on struct vtime, field renames, add delta on cputime readers, keep idle vtime as-is (low overhead accounting), harmonize clock sources. ] Suggested-by: Thomas Gleixner Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Signed-off-by: Wanpeng Li Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9ee725edcbe0..6e3ea4ac1bda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static u64 vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); + unsigned long long clock; - if (time_before(now, (unsigned long)vtime->starttime)) + clock = sched_clock_cpu(smp_processor_id()); + if (clock < vtime->starttime) return 0; - return jiffies_to_nsecs(now - vtime->starttime); + return clock - vtime->starttime; } static u64 get_vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); - u64 delta, other; + u64 delta = vtime_delta(vtime); + u64 other; /* * Unlike tick based timing, vtime based timing never has lost @@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); - vtime->starttime = now; + vtime->starttime += delta; return delta - other; } -static void __vtime_account_system(struct task_struct *tsk) +static void __vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { - account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); + vtime->stime += get_vtime_delta(vtime); + if (vtime->stime >= TICK_NSEC) { + account_system_time(tsk, irq_count(), vtime->stime); + vtime->stime = 0; + } +} + +static void vtime_account_guest(struct task_struct *tsk, + struct vtime *vtime) +{ + vtime->gtime += get_vtime_delta(vtime); + if (vtime->gtime >= TICK_NSEC) { + account_guest_time(tsk, vtime->gtime); + vtime->gtime = 0; + } } void vtime_account_system(struct task_struct *tsk) @@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + /* We might have scheduled out from guest path */ + if (current->flags & PF_VCPU) + vtime_account_guest(tsk, vtime); + else + __vtime_account_system(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - account_user_time(tsk, get_vtime_delta(vtime)); + vtime->utime += get_vtime_delta(vtime); + if (vtime->utime >= TICK_NSEC) { + account_user_time(tsk, vtime->utime); + vtime->utime = 0; + } vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } @@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); current->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + vtime_account_guest(tsk, vtime); current->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(smp_processor_id()); write_seqcount_end(&vtime->seqcount); } @@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(cpu); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } @@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t) gtime = t->gtime; if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(vtime); + gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * the right place. */ if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += delta; + *utime += vtime->utime + delta; else if (vtime->state == VTIME_SYS) - *stime += delta; + *stime += vtime->stime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From a0c4acd2c220376b4e9690e75782d0c0afdaab9f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 16 Jun 2017 16:44:34 +0300 Subject: locking/rwsem-spinlock: Fix EINTR branch in __down_write_common() If a writer could been woken up, the above branch if (sem->count == 0) break; would have moved us to taking the sem. So, it's not the time to wake a writer now, and only readers are allowed now. Thus, 0 must be passed to __rwsem_do_wake(). Next, __rwsem_do_wake() wakes readers unconditionally. But we mustn't do that if the sem is owned by writer in the moment. Otherwise, writer and reader own the sem the same time, which leads to memory corruption in callers. rwsem-xadd.c does not need that, as: 1) the similar check is made lockless there, 2) in __rwsem_mark_wake::try_reader_grant we test, that sem is not owned by writer. Signed-off-by: Kirill Tkhai Acked-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Niklas Cassel Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 17fcbd590d0c "locking/rwsem: Fix down_write_killable() for CONFIG_RWSEM_GENERIC_SPINLOCK=y" Link: http://lkml.kernel.org/r/149762063282.19811.9129615532201147826.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); - if (!list_empty(&sem->wait_list)) - __rwsem_do_wake(sem, 1); + if (!list_empty(&sem->wait_list) && sem->count >= 0) + __rwsem_do_wake(sem, 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return -EINTR; -- cgit v1.2.3 From 69d71879d2cf67a381055f698a1d7def00dc4ed7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Jul 2017 09:45:43 -0400 Subject: ftrace: Test for NULL iter->tr in regex for stack_trace_filter changes As writing into stack_trace_filter, the iter-tr is not set and is NULL. Check if it is NULL before dereferencing it in ftrace_regex_release(). Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f8c18f15b190..2953d558bbee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5043,7 +5043,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (!list_empty(&iter->tr->mod_trace)) + if (iter->tr && !list_empty(&iter->tr->mod_trace)) iter->hash->flags |= FTRACE_HASH_FL_MOD; } else orig_hash = &iter->ops->func_hash->notrace_hash; -- cgit v1.2.3 From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 7 Jun 2017 13:18:57 -0600 Subject: sched/fair: Fix load_balance() affinity redo path If load_balance() fails to migrate any tasks because all tasks were affined, load_balance() removes the source CPU from consideration and attempts to redo and balance among the new subset of CPUs. There is a bug in this code path where the algorithm considers all active CPUs in the system (minus the source that was just masked out). This is not valid for two reasons: some active CPUs may not be in the current scheduling domain and one of the active CPUs is dst_cpu. These CPUs should not be considered, as we cannot pull load from them. Instead of failing out of load_balance(), we may end up redoing the search with no valid CPUs and incorrectly concluding the domain is balanced. Additionally, if the group_imbalance flag was just set, it may also be incorrectly unset, thus the flag will not be seen by other CPUs in future load_balance() runs as that algorithm intends. Fix the check by removing CPUs not in the current domain and the dst_cpu from considertation, thus limiting the evaluation to valid remaining CPUs from which load might be migrated. Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Tested-by: Tyler Baicar Signed-off-by: Jeffrey Hugo Acked-by: Peter Zijlstra Cc: Austin Christ Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Timur Tabi Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 008c514dc241..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Also avoid computing new_dst_cpu if we have already computed - * one in current iteration. + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have + * already computed one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ @@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .tasks = LIST_HEAD_INIT(env.tasks), }; - /* - * For NEWLY_IDLE load_balancing, we don't need to consider - * other cpus in our group - */ - if (idle == CPU_NEWLY_IDLE) - env.dst_grpmask = NULL; - - cpumask_copy(cpus, cpu_active_mask); + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); @@ -8151,7 +8144,15 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * Attempting to continue load balancing at the current + * sched_domain level only makes sense if there are + * active CPUs remaining as possible busiest CPUs to + * pull load from which are not contained within the + * destination group that is receiving any migrated + * load. + */ + if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, + /* + * can_migrate_task() doesn't need to compute new_dst_cpu + * for active balancing. Since we have CPU_IDLE, but no + * @dst_grpmask we need to make that test go away with lying + * about DST_PINNED. + */ + .flags = LBF_DST_PINNED, }; schedstat_inc(sd->alb_count); -- cgit v1.2.3 From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jul 2017 16:24:49 +0100 Subject: bpf: Implement show_options Implement the show_options superblock op for bpf as part of a bid to get rid of s_options and generic_show_options() to make it easier to implement a context-based mount where the mount options can be passed individually over a file descriptor. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: netdev@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. + */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; -- cgit v1.2.3 From 9cd4f1a4e7a858849e889a081a99adff83e08e4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jul 2017 22:20:23 +0200 Subject: smp/hotplug: Move unparking of percpu threads to the control CPU Vikram reported the following backtrace: BUG: scheduling while atomic: swapper/7/0/0x00000002 CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.32-perf+ #680 schedule schedule_hrtimeout_range_clock schedule_hrtimeout wait_task_inactive __kthread_bind_mask __kthread_bind __kthread_unpark kthread_unpark cpuhp_online_idle cpu_startup_entry secondary_start_kernel He analyzed correctly that a parked cpu hotplug thread of an offlined CPU was still on the runqueue when the CPU came back online and tried to unpark it. This causes the thread which invoked kthread_unpark() to call wait_task_inactive() and subsequently schedule() with preemption disabled. His proposed workaround was to "make sure" that a parked thread has scheduled out when the CPU goes offline, so the situation cannot happen. But that's still wrong because the root cause is not the fact that the percpu thread is still on the runqueue and neither that preemption is disabled, which could be simply solved by enabling preemption before calling kthread_unpark(). The real issue is that the calling thread is the idle task of the upcoming CPU, which is not supposed to call anything which might sleep. The moron, who wrote that code, missed completely that kthread_unpark() might end up in schedule(). The solution is simpler than expected. The thread which controls the hotplug operation is waiting for the CPU to call complete() on the hotplug state completion. So the idle task of the upcoming CPU can set its state to CPUHP_AP_ONLINE_IDLE and invoke complete(). This in turn wakes the control task on a different CPU, which then can safely do the unpark and kick the now unparked hotplug thread of the upcoming CPU to complete the bringup to the final target state. Control CPU AP bringup_cpu(); __cpu_up() ------------> bringup_ap(); bringup_wait_for_ap() wait_for_completion(); cpuhp_online_idle(); <------------ complete(); unpark(AP->stopper); unpark(AP->hotplugthread); while(1) do_idle(); kick(AP->hotplugthread); wait_for_completion(); hotplug_thread() run_online_callbacks(); complete(); Fixes: 8df3e07e7f21 ("cpu/hotplug: Let upcoming cpu bring itself fully up") Reported-by: Vikram Mulukutla Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Sebastian Sewior Cc: Rusty Russell Cc: Tejun Heo Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707042218020.2131@nanos Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b03a32595cfe..ab860453841d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -271,11 +271,25 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); + static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); + BUG_ON(!cpu_online(cpu)); + + /* Unpark the stopper thread and the hotplug thread of the target cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) { + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + } return st->result; } @@ -296,9 +310,7 @@ static int bringup_cpu(unsigned int cpu) irq_unlock_sparse(); if (ret) return ret; - ret = bringup_wait_for_ap(cpu); - BUG_ON(!cpu_online(cpu)); - return ret; + return bringup_wait_for_ap(cpu); } /* @@ -767,31 +779,20 @@ void notify_cpu_starting(unsigned int cpu) } /* - * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread and unpark the smpboot threads. If the target state is - * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the - * cpu further. + * Called from the idle task. Wake up the controlling task which brings the + * stopper and the hotplug thread of the upcoming CPU up and then delegates + * the rest of the online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - unsigned int cpu = smp_processor_id(); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; st->state = CPUHP_AP_ONLINE_IDLE; - - /* Unpark the stopper thread and the hotplug thread of this cpu */ - stop_machine_unpark(cpu); - kthread_unpark(st->thread); - - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) - __cpuhp_kick_ap_work(st); - else - complete(&st->done); + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -- cgit v1.2.3 From 99c621d704cf1c4eb74c3c42e674edf3df64f92d Mon Sep 17 00:00:00 2001 From: Michael Sartain Date: Wed, 5 Jul 2017 22:07:15 -0600 Subject: tracing: Add saved_tgids file to show cached pid to tgid mappings Export the cached pid / tgid mappings in debugfs tracing saved_tgids file. This allows user apps to translate the pids from a trace to their respective thread group. Example saved_tgids file with pid / tgid values separated by ' ': # cat saved_tgids 1048 1048 1047 1047 7 7 1049 1047 1054 1047 1053 1047 Link: http://lkml.kernel.org/r/20170630004023.064965233@goodmis.org Link: http://lkml.kernel.org/r/20170706040713.unwkumbta5menygi@mikesart-cos Reviewed-by: Joel Fernandes Signed-off-by: Michael Sartain Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00e2e4169b1e..f079a8ca1117 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4688,6 +4688,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -7920,6 +7990,9 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); + trace_eval_init(); trace_create_eval_file(d_tracer); -- cgit v1.2.3 From c80081b9209713e0fe86d3def395a9fc66503c58 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 6 Jul 2017 14:29:04 +0200 Subject: genirq: Allow to pass the IRQF_TIMER flag with percpu irq request The irq timings infrastructure tracks when interrupts occur in order to statistically predict te next interrupt event. There is no point to track timer interrupts and try to predict them because the next expiration time is already known. This can be avoided via the IRQF_TIMER flag which is passed by timer drivers in request_irq(). It marks the interrupt as timer based which alloes to ignore these interrupts in the timings code. Per CPU interrupts which are requested via request_percpu_+irq() have no flag argument, so marking per cpu timer interrupts is not possible and they get tracked pointlessly. Add __request_percpu_irq() as a variant of request_percpu_irq() with a flags argument and make request_percpu_irq() an inline wrapper passing flags = 0. The flag parameter is restricted to IRQF_TIMER as all other IRQF_ flags make no sense for per cpu interrupts. The next step is to convert all existing users of request_percpu_irq() and then remove the wrapper and the underscores. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: nicolas.pitre@linaro.org Cc: vincent.guittot@linaro.org Cc: rafael@kernel.org Link: http://lkml.kernel.org/r/1499344144-3964-1-git-send-email-daniel.lezcano@linaro.org --- kernel/irq/manage.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91e1f2390752..5624b2dd6b58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1950,9 +1950,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * request_percpu_irq - allocate a percpu interrupt line + * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * @@ -1965,8 +1966,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * the handler gets called with the interrupted CPU's instance of * that variable. */ -int request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev_id) +int __request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -1980,12 +1982,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + if (flags && flags != IRQF_TIMER) + return -EINVAL; + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; @@ -2004,7 +2009,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } -EXPORT_SYMBOL_GPL(request_percpu_irq); +EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. -- cgit v1.2.3 From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 6 Jul 2017 15:35:31 -0700 Subject: kernel/extable.c: mark core_kernel_text notrace core_kernel_text is used by MIPS in its function graph trace processing, so having this method traced leads to an infinite set of recursive calls such as: Call Trace: ftrace_return_to_handler+0x50/0x128 core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 return_to_handler+0x10/0x30 return_to_handler+0x0/0x30 return_to_handler+0x0/0x30 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 (...) Mark the function notrace to avoid it being traced. Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Meyer Cc: Ingo Molnar Cc: Steven Rostedt Cc: Daniel Borkmann Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..223df4a328a4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) -- cgit v1.2.3 From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:55 -0700 Subject: kernel/power/snapshot.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7708e319941..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,15 +30,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif #include "power.h" -- cgit v1.2.3 From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:58 -0700 Subject: kernel/module.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. The usages of set_memory_xx() are still guarded by CONFIG_STRICT_MODULE_RWX. Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d7eb41d772c4..8f883d86cedc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,9 +49,7 @@ #include #include #include -#ifdef CONFIG_STRICT_MODULE_RWX -#include -#endif +#include #include #include #include -- cgit v1.2.3 From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. [richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; -- cgit v1.2.3 From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, -- cgit v1.2.3 From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:32 -0700 Subject: kernel/exit.c: don't include unused userfaultfd_k.h Commit dd0db88d8094 ("userfaultfd: non-cooperative: rollback userfaultfd_exit") removed userfaultfd callback from exit() which makes the include of unnecessary. Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b0cc86a2d00b..2bbc23273e2f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:11 -0700 Subject: mm: update callers to use HASH_ZERO flag Update dcache, inode, pid, mountpoint, and mount hash tables to use HASH_ZERO, and remove initialization after allocations. In case of places where HASH_EARLY was used such as in __pv_init_lock_hash the zeroed hash table was already assumed, because memblock zeroes the memory. CPU: SPARC M6, Memory: 7T Before fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 11.798s After fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 3.198s CPU: Intel Xeon E5-2630, Memory: 2.2T: Before fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.245s After fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.244s Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/qspinlock_paravirt.h | 3 ++- kernel/pid.c | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) -- cgit v1.2.3 From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:06 -0700 Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..5fd1bdbaa381 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; if (need_loop) { -- cgit v1.2.3 From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:09 -0700 Subject: mm, cpuset: always use seqlock when changing task's nodemask When updating task's mems_allowed and rebinding its mempolicy due to cpuset's mems being changed, we currently only take the seqlock for writing when either the task has a mempolicy, or the new mems has no intersection with the old mems. This should be enough to prevent a parallel allocation seeing no available nodes, but the optimization is IMHO unnecessary (cpuset updates should not be frequent), and we still potentially risk issues if the intersection of new and old nodes has limited amount of free/reclaimable memory. Let's just use the seqlock for all tasks. Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5fd1bdbaa381..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } -- cgit v1.2.3 From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:49 -0700 Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages The kmem-specific functions do the same thing. Switch and drop. Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..aa01b810c0bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } -- cgit v1.2.3 From f610c9d68b1a47f539b7764f4b5ce07d32fb9ae1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Jul 2017 08:57:57 +0200 Subject: genirq/debugfs: Remove redundant NULL pointer check debugfs_remove() can be called with a NULL pointer. Fixes: 087cdfb662ae5 ("genirq/debugfs: Add proper debugfs interface") Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 14fe862aa2e3..ed47688b8e79 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1667,8 +1667,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - if (d->debugfs_file) - debugfs_remove(d->debugfs_file); + debugfs_remove(d->debugfs_file); } void __init irq_domain_debugfs_init(struct dentry *root) -- cgit v1.2.3 From c5c601c4295f89368f4a304cb3ae4aebdf80db22 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 7 Jul 2017 09:39:59 +0100 Subject: irqdomain: Allow ACPI device nodes to be used as irqdomain identifiers A number of irqchip implementations are (ab)using the irqdomain allocator by passing a fwnode that is neither a FWNODE_OF or a FWNODE_IRQCHIP. This is pretty bad, but it also feels pretty crap to force these drivers to allocate their own irqchip_fwid when they already have a proper fwnode. Instead, let's teach the irqdomain allocator about ACPI device nodes, and add some lovely name generation code... Tested on an arm64 D05 system. Reported-and-tested-by: John Garry Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Agustin Vega-Frias Cc: Ma Jun Cc: Hanjun Guo Link: http://lkml.kernel.org/r/20170707083959.10349-1-marc.zyngier@arm.com --- kernel/irq/irqdomain.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ed47688b8e79..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "irq: " fmt +#include #include #include #include @@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } +#ifdef CONFIG_ACPI + } else if (is_acpi_device_node(fwnode)) { + struct acpi_buffer buf = { + .length = ACPI_ALLOCATE_BUFFER, + }; + acpi_handle handle; + + handle = acpi_device_handle(to_acpi_device_node(fwnode)); + if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { + domain->name = buf.pointer; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + domain->fwnode = fwnode; +#endif } else if (of_node) { char *name; -- cgit v1.2.3 From eaf260ac04d9b4cf9f458d5c97555bfff2da526e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:21 -0700 Subject: tracing: Treat recording comm for idle task as a success Currently we stop recording comm for non-idle tasks when switching from/to idle task since we treat that as a record failure. Fix that by treat recording of comm for idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f079a8ca1117..6722d86f2af5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* -- cgit v1.2.3 From bd45d34d25720a820021c8ea45de5cd607eace64 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:22 -0700 Subject: tracing: Treat recording tgid for idle task as a success Currently we stop recording tgid for non-idle tasks when switching from/to idle task since we treat that as a record failure. Fix that by treat recording of tgid for idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6722d86f2af5..aee11e3a394f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2006,7 +2006,11 @@ int trace_find_tgid(int pid) static int trace_save_tgid(struct task_struct *tsk) { - if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) return 0; tgid_map[tsk->pid] = tsk->tgid; -- cgit v1.2.3 From 29b1a8ad7df4528b862a79e3d5fb0936f4d199c7 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:23 -0700 Subject: tracing: Attempt to record other information even if some fail In recent patches where we record comm and tgid at the same time, we skip continuing to record if any fail. Fix that by trying to record as many things as we can even if some couldn't be recorded. If any information isn't recorded, then we don't set trace_taskinfo_save as before. Link: http://lkml.kernel.org/r/20170706230023.17942-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aee11e3a394f..92af8fd1429b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags) */ void tracing_record_taskinfo(struct task_struct *task, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) - return; - if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -2058,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && - (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) - return; + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - if ((flags & TRACE_RECORD_TGID) && - (!trace_save_tgid(prev) || !trace_save_tgid(next))) + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); -- cgit v1.2.3 From 5671360f29c68d9079914438f6a0109ef62f82a8 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 8 Jul 2017 04:56:58 +0900 Subject: locking/qspinlock: Explicitly include asm/prefetch.h In architectures that use qspinlock, like x86, prefetch is loaded indirectly via the asm/qspinlock.h include. On other architectures, like OpenRISC, which may want to use asm-generic/qspinlock.h the built will fail without the asm/prefetch.h include. Fix this by including directly. Signed-off-by: Stafford Horne Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170707195658.23840-1-shorne@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From 659b957f20c78fd470083c80af5e79eedfb39e5b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:24 +0530 Subject: kprobes: Rename [arch_]function_offset_within_entry() to [arch_]kprobe_on_func_entry() Rename function_offset_within_entry() to scope it to kprobe namespace by using kprobe_ prefix, and to also simplify it. Suggested-by: Ingo Molnar Suggested-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3aa6c7e2e4fb6e00f3c24fa306496a66edb558ea.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 8 ++++---- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6756d750b31b..a519219169fd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1888,12 +1888,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); -bool __weak arch_function_offset_within_entry(unsigned long offset) +bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; } -bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); @@ -1901,7 +1901,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign return false; if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || - !arch_function_offset_within_entry(offset)) + !arch_kprobe_on_func_entry(offset)) return false; return true; @@ -1914,7 +1914,7 @@ int register_kretprobe(struct kretprobe *rp) int i; void *addr; - if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) return -EINVAL; if (kretprobe_blacklist_size) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b53c8d369163..2c5221819be5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -720,7 +720,7 @@ static int create_trace_kprobe(int argc, char **argv) return ret; } if (offset && is_return && - !function_offset_within_entry(NULL, symbol, offset)) { + !kprobe_on_func_entry(NULL, symbol, offset)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } -- cgit v1.2.3 From 0f73ff80b751b39ff539a550e65c5bd131ff0316 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:25 +0530 Subject: kprobes: Simplify register_jprobes() Re-factor jprobe registration functions as the current version is getting too unwieldy. Move the actual jprobe registration to register_jprobe() and re-organize code accordingly. Suggested-by: Ingo Molnar Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/089cae4bfe73767f765291ee0e6fb0c3d240e5f1.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a519219169fd..db3cd3e60bdd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) int register_jprobes(struct jprobe **jps, int num) { - struct jprobe *jp; int ret = 0, i; if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { - unsigned long addr, offset; - jp = jps[i]; - addr = arch_deref_entry_point(jp->entry); - - /* Verify probepoint is a function entry point */ - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && - offset == 0) { - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); - } else - ret = -EINVAL; + ret = register_jprobe(jps[i]); if (ret < 0) { if (i > 0) @@ -1796,13 +1785,26 @@ int register_jprobes(struct jprobe **jps, int num) break; } } + return ret; } EXPORT_SYMBOL_GPL(register_jprobes); int register_jprobe(struct jprobe *jp) { - return register_jprobes(&jp, 1); + unsigned long addr, offset; + struct kprobe *kp = &jp->kp; + + /* Verify probepoint is a function entry point */ + addr = arch_deref_entry_point(jp->entry); + + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) { + kp->pre_handler = setjmp_pre_handler; + kp->break_handler = longjmp_break_handler; + return register_kprobe(kp); + } + + return -EINVAL; } EXPORT_SYMBOL_GPL(register_jprobe); -- cgit v1.2.3 From dbf580623d5fee785218d1a47a2bcdf36d85c0e9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:26 +0530 Subject: kprobes: Ensure that jprobe probepoints are at function entry Similar to commit 90ec5e89e393c ("kretprobes: Ensure probe location is at function entry"), ensure that the jprobe probepoint is at function entry. Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a4525af6c5a42df385efa31251246cf7cca73598.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index db3cd3e60bdd..a1606a4224e1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1795,10 +1795,14 @@ int register_jprobe(struct jprobe *jp) unsigned long addr, offset; struct kprobe *kp = &jp->kp; - /* Verify probepoint is a function entry point */ + /* + * Verify probepoint as well as the jprobe handler are + * valid function entry points. + */ addr = arch_deref_entry_point(jp->entry); - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) { + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && + kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { kp->pre_handler = setjmp_pre_handler; kp->break_handler = longjmp_break_handler; return register_kprobe(kp); -- cgit v1.2.3 From 634a81609561f05266e1f625b6f2567c2e0b0419 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Jul 2017 11:26:39 -0400 Subject: fix waitid(2) breakage We lose the distinction between "found a PID" and "nothing, but that's not an error" a bit too early in waitid(). Easily fixed, fortunately... Reported-by: Markus Trippelsdorf Fixes: 67d7ddded322 ("waitid(2): leave copyout of siginfo to syscall itself") Tested-by: Markus Trippelsdorf Signed-off-by: Al Viro --- kernel/exit.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2bbc23273e2f..608c9775a37b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1590,9 +1590,6 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) - ret = 0; - put_pid(pid); return ret; } @@ -1603,6 +1600,11 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) @@ -1612,7 +1614,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); @@ -1714,6 +1716,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err && uru) { /* kernel_waitid() overwrites everything in ru */ @@ -1729,7 +1736,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); -- cgit v1.2.3 From fca18a47cf3eb8425ec19c2dfc374f3d04f5219f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Sat, 8 Jul 2017 00:27:30 +0530 Subject: trace/kprobes: Sanitize derived event names When we derive event names, convert some expected symbols (such as ':' used to specify module:name and '.' present in some symbols) into underscores so that the event name is not rejected. Before this patch: # echo 'p kobject_example:foo_store' > kprobe_events trace_kprobe: Failed to allocate trace_probe.(-22) -sh: write error: Invalid argument After this patch: # echo 'p kobject_example:foo_store' > kprobe_events # cat kprobe_events p:kprobes/p_kobject_example_foo_store_0 kobject_example:foo_store Link: http://lkml.kernel.org/r/66c189e09e71361aba91dd4a5bd146a1b62a7a51.1499453040.git.naveen.n.rao@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..44fd819aa33d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* @@ -740,6 +748,7 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, -- cgit v1.2.3 From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:02 -0700 Subject: mm: make PR_SET_THP_DISABLE immediately active PR_SET_THP_DISABLE has a rather subtle semantic. It doesn't affect any existing mapping because it only updated mm->def_flags which is a template for new mappings. The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE flag set. This can be quite surprising for all those applications which do not do prctl(); fork() & exec() and want to control their own THP behavior. Another usecase when the immediate semantic of the prctl might be useful is a combination of pre- and post-copy migration of containers with CRIU. In this case CRIU populates a part of a memory region with data that was saved during the pre-copy stage. Afterwards, the region is registered with userfaultfd and CRIU expects to get page faults for the parts of the region that were not yet populated. However, khugepaged collapses the pages and the expected page faults do not occur. In more general case, the prctl(PR_SET_THP_DISABLE) could be used as a temporary mechanism for enabling/disabling THP process wide. Implementation wise, a new MMF_DISABLE_THP flag is added. This flag is tested when decision whether to use huge pages is taken either during page fault of at the time of THP collapse. It should be noted, that the new implementation makes PR_SET_THP_DISABLE master override to any per-VMA setting, which was not the case previously. Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE") Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: "Kirill A. Shutemov" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..73fc0af147d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: -- cgit v1.2.3 From 9dcdcea11491f6eee65bd1b352293ca01e4b7997 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 10 Jul 2017 15:51:14 -0700 Subject: kernel/ksysfs.c: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 1120 544 16 1680 690 kernel/ksysfs.o File size After adding 'const': text data bss dec hex filename 1160 480 16 1656 678 kernel/ksysfs.o Link: http://lkml.kernel.org/r/aa224b3cc923fdbb3edd0c41b2c639c85408c9e8.1498737347.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Kees Cook Cc: Russell King Cc: Dave Young Cc: Hari Bathini Cc: Petr Tesarik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..df1a9aa602a0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { NULL }; -static struct attribute_group kernel_attr_group = { +static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; -- cgit v1.2.3 From b7b2562f7252878e18de60c24f320052076f9de8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 10 Jul 2017 15:51:17 -0700 Subject: kernel/groups.c: use sort library function setgroups is not exactly a hot path, so we might as well use the library function instead of open-coding the sorting. Saves ~150 bytes. Link: http://lkml.kernel.org/r/1497301378-22739-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple Shell sort */ +static int gid_cmp(const void *_a, const void *_b) +{ + kgid_t a = *(kgid_t *)_a; + kgid_t b = *(kgid_t *)_b; + + return gid_gt(a, b) - gid_lt(a, b); +} + static void groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - kgid_t tmp = group_info->gid[right]; - - while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { - group_info->gid[right] = group_info->gid[left]; - right = left; - left -= stride; - } - group_info->gid[right] = tmp; - } - stride /= 3; - } + sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), + gid_cmp, NULL); } /* a simple bsearch */ -- cgit v1.2.3 From 63b23e2cbc8e80de3e40184ecb2c3bfb705776fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jul 2017 15:51:20 -0700 Subject: kernel/kallsyms.c: replace all_var with IS_ENABLED(CONFIG_KALLSYMS_ALL) 'all_var' looks like a variable, but is actually a macro. Use IS_ENABLED(CONFIG_KALLSYMS_ALL) for clarification. Link: http://lkml.kernel.org/r/1497577591-3434-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -28,12 +28,6 @@ #include -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - /* * These will be re-linked against their real values * during the second link stage. @@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) static int is_ksym_addr(unsigned long addr) { - if (all_var) + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; - else if (all_var) + else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; -- cgit v1.2.3 From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 3 ++- kernel/module.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); -- cgit v1.2.3 From 4ea77014af0d6205b05503d1c7aac6eace11d473 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:52:57 -0700 Subject: kernel/signal.c: avoid undefined behaviour in kill_something_info When running kill(72057458746458112, 0) in userspace I hit the following issue. UBSAN: Undefined behaviour in kernel/signal.c:1462:11 negation of -2147483648 cannot be represented in type 'int': CPU: 226 PID: 9849 Comm: test Tainted: G B ---- ------- 3.10.0-327.53.58.70.x86_64_ubsan+ #116 Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, BIOS BLHSV028 11/11/2014 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SYSC_kill+0x43e/0x4d0 SyS_kill+0xe/0x10 system_call_fastpath+0x16/0x1b Add code to avoid the UBSAN detection. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1496670008-59084-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 48a59eefd8ad..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, -- cgit v1.2.3 From dd83c161fbcc5d8be637ab159c0de015cbff5ba4 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:53:01 -0700 Subject: kernel/exit.c: avoid undefined behaviour when calling wait4() wait4(-2147483648, 0x20, 0, 0xdd0000) triggers: UBSAN: Undefined behaviour in kernel/exit.c:1651:9 The related calltrace is as follows: negation of -2147483648 cannot be represented in type 'int': CPU: 9 PID: 16482 Comm: zj Tainted: G B ---- ------- 3.10.0-327.53.58.71.x86_64+ #66 Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 /BC11BTSA , BIOS CTSAV036 04/27/2011 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SyS_wait4+0x1cb/0x1e0 system_call_fastpath+0x16/0x1b Exclude the overflow to avoid the UBSAN warning. Link: http://lkml.kernel.org/r/1497264618-20212-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: David Rientjes Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 608c9775a37b..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { -- cgit v1.2.3 From 6a8a75f3235724c5941a33e287b2f98966ad14c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Jul 2017 10:56:54 +0200 Subject: Revert "perf/core: Drop kernel samples even though :u is specified" This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829. This commit introduced a regression that broke rr-project, which uses sampling events to receive a signal on overflow (but does not care about the contents of the sample). These signals are critical to the correct operation of rr. There's been some back and forth about how to fix it - but to not keep applications in limbo queue up a revert. Reported-by: Kyle Huey Acked-by: Kyle Huey Acked-by: Peter Zijlstra Cc: Jin Yao Cc: Vince Weaver Cc: Linus Torvalds Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Stephane Eranian Cc: Namhyung Kim Cc: Jiri Olsa Cc: Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d2c32f98482..9747e422ab20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7308,21 +7308,6 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -7343,12 +7328,6 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - /* - * For security, drop the skid kernel samples if necessary. - */ - if (!sample_is_allowed(event, regs)) - return ret; - /* * XXX event_limit might not quite work as expected on inherited * events -- cgit v1.2.3 From dea1d0f5f1284e3defee4b8484d9fc230686cd42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 22:06:24 +0200 Subject: smp/hotplug: Replace BUG_ON and react useful The move of the unpark functions to the control thread moved the BUG_ON() there as well. While it made some sense in the idle thread of the upcoming CPU, it's bogus to crash the control thread on the already online CPU, especially as the function has a return value and the callsite is prepared to handle an error return. Replace it with a WARN_ON_ONCE() and return a proper error code. Fixes: 9cd4f1a4e7a8 ("smp/hotplug: Move unparking of percpu threads to the control CPU") Rightfully-ranted-at-by: Linux Torvalds Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ab860453841d..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -279,7 +279,8 @@ static int bringup_wait_for_ap(unsigned int cpu) /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); - BUG_ON(!cpu_online(cpu)); + if (WARN_ON_ONCE((!cpu_online(cpu)))) + return -ECANCELED; /* Unpark the stopper thread and the hotplug thread of the target cpu */ stop_machine_unpark(cpu); -- cgit v1.2.3 From b11fb73743fc406204e0749ead18560aeda8b136 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 15:43:24 -0400 Subject: tracing: Fixup trace file header alignment The addition of TGID to the tracing header added a check to see if TGID shoudl be displayed or not, and updated the header accordingly. Unfortunately, it broke the default header. Also add constant strings to use for spacing. This does remove the visibility of the header a bit, but cuts it down from the extended lines much greater than 80 characters. Before this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU#|||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.277830: migration_init <-do_one_initcall swapper/0-1 [002] d... 13.861967: Unknown type 1201 swapper/0-1 [002] d..1 13.861970: Unknown type 1202 After this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.278245: migration_init <-do_one_initcall swapper/0-1 [003] d... 13.861189: Unknown type 1201 swapper/0-1 [003] d..1 13.861192: Unknown type 1202 Cc: Joel Fernandes Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92af8fd1429b..dabd810a10cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3358,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - - seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); - seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); - seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); - seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ? tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? " | " : space); } void -- cgit v1.2.3 From bbd1d27d863d5c0acee65ecd0c2e34035e1df5ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 19:21:04 -0400 Subject: tracing: Do note expose stack_trace_filter without DYNAMIC_FTRACE The "stack_trace_filter" file only makes sense if DYNAMIC_FTRACE is configured in. If it is not, then the user can not filter any functions. Not only that, the open function causes warnings when DYNAMIC_FTRACE is not set. Link: http://lkml.kernel.org/r/20170710110521.600806-1-arnd@arndb.de Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); -- cgit v1.2.3 From 69449bbd65687e8e5fb968a5a0c46089f6af6001 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Jul 2017 10:44:03 +0200 Subject: ftrace: Hide cached module code for !CONFIG_MODULES When modules are disabled, we get a harmless build warning: kernel/trace/ftrace.c:4051:13: error: 'process_cached_mods' defined but not used [-Werror=unused-function] This adds the same #ifdef around the new code that exists around its caller. Link: http://lkml.kernel.org/r/20170710084413.1820568-1-arnd@arndb.de Fixes: d7fbf8df7ca0 ("ftrace: Implement cached modules tracing on module load") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2953d558bbee..4706f0ed193e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3978,6 +3978,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable); +#ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) { @@ -4068,6 +4069,7 @@ static void process_cached_mods(const char *mod_name) kfree(mod); } +#endif /* * We register the module command as a template to show others how -- cgit v1.2.3 From 19d39a3810e7032f311ef83effdac40339b9d022 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 23:41:52 +0200 Subject: genirq: Keep chip buslock across irq_request/release_resources() Moving the irq_request/release_resources() callbacks out of the spinlocked, irq disabled and bus locked region, unearthed an interesting abuse of the irq_bus_lock/irq_bus_sync_unlock() callbacks. The OMAP GPIO driver does merily power management inside of them. The irq_request_resources() callback of this GPIO irqchip calls a function which reads a GPIO register. That read aborts now because the clock of the GPIO block is not magically enabled via the irq_bus_lock() callback. Move the callbacks under the bus lock again to prevent this. In the free_irq() path this requires to drop the bus_lock before calling synchronize_irq() and reaquiring it before calling the irq_release_resources() callback. The bus lock can't be held because: 1) The data which has been changed between bus_lock/un_lock is cached in the irq chip driver private data and needs to go out to the irq chip via the slow bus (usually SPI or I2C) before calling synchronize_irq(). That's the reason why this bus_lock/unlock magic exists in the first place, as you cannot do SPI/I2C transactions while holding desc->lock with interrupts disabled. 2) synchronize_irq() will actually deadlock, if there is a handler on flight. These chips use threaded handlers for obvious reasons, as they allow to do SPI/I2C communication. When the threaded handler returns then bus_lock needs to be taken in irq_finalize_oneshot() as we need to talk to the actual irq chip once more. After that the threaded handler is marked done, which makes synchronize_irq() return. So if we hold bus_lock accross the synchronize_irq() call, the handler cannot mark itself done because it blocks on the bus lock. That in turn makes synchronize_irq() wait forever on the threaded handler to complete.... Add the missing unlock of desc->request_mutex in the error path of __free_irq() and add a bunch of comments to explain the locking and protection rules. Fixes: 46e48e257360 ("genirq: Move irq resource handling out of spinlocked region") Reported-and-tested-by: Sebastian Reichel Reported-and-tested-by: Tony Lindgren Reported-by: Pavel Machek Signed-off-by: Thomas Gleixner Not-longer-ranted-at-by: Linus Torvalds Cc: Linus Walleij Cc: Grygorii Strashko Cc: Marc Zyngier --- kernel/irq/manage.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5624b2dd6b58..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. + * + * Locking rules: + * + * desc->request_mutex Provides serialization against a concurrent free_irq() + * chip_bus_lock Provides serialization for slow bus operations + * desc->lock Provides serialization against hard interrupts + * + * chip_bus_lock and desc->lock are sufficient for all other management and + * interrupt related functions. desc->request_mutex solely serializes + * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) @@ -1167,20 +1177,35 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + /* + * Protects against a concurrent __free_irq() call which might wait + * for synchronize_irq() to complete without holding the optional + * chip bus lock and desc->lock. + */ mutex_lock(&desc->request_mutex); + + /* + * Acquire bus lock as the irq_request_resources() callback below + * might rely on the serialization or the magic power management + * functions which are abusing the irq_bus_lock() callback, + */ + chip_bus_lock(desc); + + /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mutex; + goto out_bus_unlock; } } - chip_bus_lock(desc); - /* * The following block of code has to be executed atomically + * protected against a concurrent interrupt and any of the other + * management calls which are not serialized via + * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; @@ -1286,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - irq_release_resources(desc); + if (ret) goto out_unlock; - } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ @@ -1385,12 +1408,10 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); - if (!desc->action) irq_release_resources(desc); - -out_mutex: +out_bus_unlock: + chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: @@ -1472,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); return NULL; } @@ -1498,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + /* + * Drop bus_lock here so the changes which were done in the chip + * callbacks above are synced out to the irq chips which hang + * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * + * Aside of that the bus_lock can also be taken from the threaded + * handler in irq_finalize_oneshot() which results in a deadlock + * because synchronize_irq() would wait forever for the thread to + * complete, which is blocked on the bus lock. + * + * The still held desc->request_mutex() protects against a + * concurrent request_irq() of this irq so the release of resources + * and timing data is properly serialized. + */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1530,8 +1566,15 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + /* Last action releases resources */ if (!desc->action) { + /* + * Reaquire bus lock as irq_release_resources() might + * require it to deallocate resources over the slow bus. + */ + chip_bus_lock(desc); irq_release_resources(desc); + chip_bus_sync_unlock(desc); irq_remove_timings(desc); } -- cgit v1.2.3 From ab2f7cf141aa6734c4ca7525132d8cc236efee77 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 6 Jul 2017 10:53:20 -0700 Subject: cpufreq: schedutil: Fix sugov_start() versus sugov_update_shared() race With a shared policy in place, when one of the CPUs in the policy is hotplugged out and then brought back online, sugov_stop() and sugov_start() are called in order. sugov_stop() removes utilization hooks for each CPU in the policy and does nothing else in the for_each_cpu() loop. sugov_start() on the other hand iterates through the CPUs in the policy and re-initializes the per-cpu structure _and_ adds the utilization hook. This implies that the scheduler is allowed to invoke a CPU's utilization update hook when the rest of the per-cpu structures have yet to be re-inited. Apart from some strange values in tracepoints this doesn't cause a problem, but if we do end up accessing a pointer from the per-cpu sugov_cpu structure somewhere in the sugov_update_shared() path, we will likely see crashes since the memset for another CPU in the policy is free to race with sugov_update_shared from the CPU that is ready to go. So let's fix this now to first init all per-cpu structures, and then add the per-cpu utilization update hooks all at once. Signed-off-by: Vikram Mulukutla Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? sugov_update_shared : -- cgit v1.2.3 From 44925dfff05fd1a897992d278b15a6b6b55e79a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:33:40 +0300 Subject: ftrace: Remove an unneeded NULL check "func" can't be NULL and it doesn't make sense to check because we've already derefenced it. Link: http://lkml.kernel.org/r/20170712073340.4enzeojeoupuds5a@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4706f0ed193e..5fb5b40b3ae8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3950,7 +3950,7 @@ static int cache_mod(struct trace_array *tr, continue; /* no func matches all */ - if (!func || strcmp(func, "*") == 0 || + if (strcmp(func, "*") == 0 || (ftrace_mod->func && strcmp(ftrace_mod->func, func) == 0)) { ret = 0; -- cgit v1.2.3 From 2e028c4fe12907f226b8221815f16c2486ad3aa7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:35:57 +0300 Subject: ftrace: Fix uninitialized variable in match_records() My static checker complains that if "func" is NULL then "clear_filter" is uninitialized. This seems like it could be true, although it's possible something subtle is happening that I haven't seen. kernel/trace/ftrace.c:3844 match_records() error: uninitialized symbol 'clear_filter'. Link: http://lkml.kernel.org/r/20170712073556.h6tkpjcdzjaozozs@mwanda Cc: stable@vger.kernel.org Fixes: f0a3b154bd7 ("ftrace: Clarify code for mod command") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5fb5b40b3ae8..53f6b6401cf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3816,7 +3816,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, -- cgit v1.2.3 From 58c7ffc0747a3a9145629d4966291f0586703767 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jul 2017 04:59:45 +0100 Subject: fix a braino in compat_sys_getrlimit() Reported-and-tested-by: Meelis Roos Fixes: commit d9e968cb9f84 "getrlimit()/setrlimit(): move compat to native" Signed-off-by: Al Viro Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 73fc0af147d0..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, ret = do_prlimit(current, resource, NULL, &r); if (!ret) { - struct rlimit r32; + struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else -- cgit v1.2.3 From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Jul 2017 14:33:11 -0700 Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts The reason to disable interrupts seems to be to avoid switching to a different processor while handling per cpu data using individual loads and stores. If we use per cpu RMV primitives we will not have to disable interrupts. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org Signed-off-by: Christoph Lameter Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f69a3e5281e..d2b9d7c31eaf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; -- cgit v1.2.3 From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:14 -0700 Subject: kexec: move vmcoreinfo out of the kernel's .bss section As Eric said, "what we need to do is move the variable vmcoreinfo_note out of the kernel's .bss section. And modify the code to regenerate and keep this information in something like the control page. Definitely something like this needs a page all to itself, and ideally far away from any other kernel data structures. I clearly was not watching closely the data someone decided to keep this silly thing in the kernel's .bss section." This patch allocates extra pages for these vmcoreinfo_XXX variables, one advantage is that it enhances some safety of vmcoreinfo, because vmcoreinfo now is kept far away from other kernel data structures. Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Reviewed-by: Juergen Gross Suggested-by: Eric Biederman Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Hari Bathini Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 26 ++++++++++++++++++++++---- kernel/ksysfs.c | 2 +- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..2837d6164db8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,10 @@ #include /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; +u32 *vmcoreinfo_note; /* * parsing the "crashkernel" commandline @@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index df1a9aa602a0..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.2.3 From 5203f4995d9a87952a83c2ce7866adbbe8f97bb5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:17 -0700 Subject: powerpc/fadump: use the correct VMCOREINFO_NOTE_SIZE for phdr vmcoreinfo_max_size stands for the vmcoreinfo_data, the correct one we should use is vmcoreinfo_note whose total size is VMCOREINFO_NOTE_SIZE. Like explained in commit 77019967f06b ("kdump: fix exported size of vmcoreinfo note"), it should not affect the actual function, but we better fix it, also this change should be safe and backward compatible. After this, we can get rid of variable vmcoreinfo_max_size, let's use the corresponding macros directly, fewer variables means more safety for vmcoreinfo operation. [xlpang@redhat.com: fix build warning] Link: http://lkml.kernel.org/r/1494830606-27736-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/1493281021-20737-2-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Reviewed-by: Mahesh Salgaonkar Reviewed-by: Dave Young Cc: Hari Bathini Cc: Benjamin Herrenschmidt Cc: Eric Biederman Cc: Juergen Gross Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2837d6164db8..315adbf9cb68 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -16,7 +16,6 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; u32 *vmcoreinfo_note; /* @@ -343,7 +342,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); -- cgit v1.2.3 From 1229384f5b856d83698c38f9dedfd836e26711cb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:21 -0700 Subject: kdump: protect vmcoreinfo data under the crash memory Currently vmcoreinfo data is updated at boot time subsys_initcall(), it has the risk of being modified by some wrong code during system is running. As a result, vmcore dumped may contain the wrong vmcoreinfo. Later on, when using "crash", "makedumpfile", etc utility to parse this vmcore, we probably will get "Segmentation fault" or other unexpected errors. E.g. 1) wrong code overwrites vmcoreinfo_data; 2) further crashes the system; 3) trigger kdump, then we obviously will fail to recognize the crash context correctly due to the corrupted vmcoreinfo. Now except for vmcoreinfo, all the crash data is well protected(including the cpu note which is fully updated in the crash path, thus its correctness is guaranteed). Given that vmcoreinfo data is a large chunk prepared for kdump, we better protect it as well. To solve this, we relocate and copy vmcoreinfo_data to the crash memory when kdump is loading via kexec syscalls. Because the whole crash memory will be protected by existing arch_kexec_protect_crashkres() mechanism, we naturally protect vmcoreinfo_data from write(even read) access under kernel direct mapping after kdump is loaded. Since kdump is usually loaded at the very early stage after boot, we can trust the correctness of the vmcoreinfo data copied. On the other hand, we still need to operate the vmcoreinfo safe copy when crash happens to generate vmcoreinfo_note again, we rely on vmap() to map out a new kernel virtual address and update to use this new one instead in the following crash_save_vmcoreinfo(). BTW, we do not touch vmcoreinfo_note, because it will be fully updated using the protected vmcoreinfo_data after crash which is surely correct just like the cpu crash note. Link: http://lkml.kernel.org/r/1493281021-20737-3-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Eric Biederman Cc: Hari Bathini Cc: Juergen Gross Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 17 ++++++++++++++++- kernel/kexec.c | 8 ++++++++ kernel/kexec_core.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/kexec_file.c | 8 ++++++++ 4 files changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 315adbf9cb68..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,9 +15,12 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; +static size_t vmcoreinfo_size; u32 *vmcoreinfo_note; +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + /* * parsing the "crashkernel" commandline * @@ -323,11 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 766e7e4d3ad9..c8f7f77e9fa9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -298,6 +298,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; -- cgit v1.2.3 From a19ac3374995382a994653ff372b98ea7cbad548 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:30 -0700 Subject: sysctl: kdoc'ify sysctl_writes_strict Document the different sysctl_writes_strict modes in code. Link: http://lkml.kernel.org/r/20170519033554.18592-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..02725178694a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. + */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From d383d48470819e86fe30eb72f0e9494e1ee0e2af Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:33 -0700 Subject: sysctl: fold sysctl_writes_strict checks into helper The mode sysctl_writes_strict positional checks keep being copy and pasted as we add new proc handlers. Just add a helper to avoid code duplication. Link: http://lkml.kernel.org/r/20170519033554.18592-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 02725178694a..6f3bb1f099fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1970,6 +1970,32 @@ static void warn_sysctl_write(struct ctl_table *table) current->comm, table->procname); } +/** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. + */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1990,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2193,17 +2219,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2468,17 +2485,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; -- cgit v1.2.3 From 4f2fec00afa60aa8e5d1b7f2a8e0526900f55623 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:36 -0700 Subject: sysctl: simplify unsigned int support Commit e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") added proc_douintvec() to start help adding support for unsigned int, this however was only half the work needed. Two fixes have come in since then for the following issues: o Printing the values shows a negative value, this happens since do_proc_dointvec() and this uses proc_put_long() This was fixed by commit 5380e5644afbba9 ("sysctl: don't print negative flag for proc_douintvec"). o We can easily wrap around the int values: UINT_MAX is 4294967295, if we echo in 4294967295 + 1 we end up with 0, using 4294967295 + 2 we end up with 1. o We echo negative values in and they are accepted This was fixed by commit 425fffd886ba ("sysctl: report EINVAL if value is larger than UINT_MAX for proc_douintvec"). It still also failed to be added to sysctl_check_table()... instead of adding it with the current implementation just provide a proper and simplified unsigned int support without any array unsigned int support with no negative support at all. Historically sysctl proc helpers have supported arrays, due to the complexity this adds though we've taken a step back to evaluate array users to determine if its worth upkeeping for unsigned int. An evaluation using Coccinelle has been done to perform a grammatical search to ask ourselves: o How many sysctl proc_dointvec() (int) users exist which likely should be moved over to proc_douintvec() (unsigned int) ? Answer: about 8 - Of these how many are array users ? Answer: Probably only 1 o How many sysctl array users exist ? Answer: about 12 This last question gives us an idea just how popular arrays: they are not. Array support should probably just be kept for strings. The identified uint ports are: drivers/infiniband/core/ucma.c - max_backlog drivers/infiniband/core/iwcm.c - default_backlog net/core/sysctl_net_core.c - rps_sock_flow_sysctl() net/netfilter/nf_conntrack_timestamp.c - nf_conntrack_timestamp -- bool net/netfilter/nf_conntrack_acct.c nf_conntrack_acct -- bool net/netfilter/nf_conntrack_ecache.c - nf_conntrack_events -- bool net/netfilter/nf_conntrack_helper.c - nf_conntrack_helper -- bool net/phonet/sysctl.c proc_local_port_range() The only possible array users is proc_local_port_range() but it does not seem worth it to add array support just for this given the range support works just as well. Unsigned int support should be desirable more for when you *need* more than INT_MAX or using int min/max support then does not suffice for your ranges. If you forget and by mistake happen to register an unsigned int proc entry with an array, the driver will fail and you will get something as follows: sysctl table check failed: debug/test_sysctl//uint_0002 array now allowed CPU: 2 PID: 1342 Comm: modprobe Tainted: G W E Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Call Trace: dump_stack+0x63/0x81 __register_sysctl_table+0x350/0x650 ? kmem_cache_alloc_trace+0x107/0x240 __register_sysctl_paths+0x1b3/0x1e0 ? 0xffffffffc005f000 register_sysctl_table+0x1f/0x30 test_sysctl_init+0x10/0x1000 [test_sysctl] do_one_initcall+0x52/0x1a0 ? kmem_cache_alloc_trace+0x107/0x240 do_init_module+0x5f/0x200 load_module+0x1867/0x1bd0 ? __symbol_put+0x60/0x60 SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 entry_SYSCALL_64_fastpath+0x1e/0xad RIP: 0033:0x7f042b22d119 Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Link: http://lkml.kernel.org/r/20170519033554.18592-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Alexey Dobriyan Cc: Subash Abhinov Kasiviswanathan Cc: Liping Zhang Cc: Alexey Dobriyan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 146 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f3bb1f099fa..d12078fc215f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2175,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2287,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2322,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* -- cgit v1.2.3 From 61d9b56a89208d8cccd0b4cfec7e6959717e16e3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:40 -0700 Subject: sysctl: add unsigned int range support To keep parity with regular int interfaces provide the an unsigned int proc_douintvec_minmax() which allows you to specify a range of allowed valid numbers. Adding proc_douintvec_minmax_sysadmin() is easy but we can wait for an actual user for that. Link: http://lkml.kernel.org/r/20170519033554.18592-6-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook Cc: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d12078fc215f..df9f2a367882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2567,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. + */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3066,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3108,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3 From 9380fa60b10ebd6ee7c3fcdb2cf162f4d7cf9fc5 Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 12 Jul 2017 14:34:01 -0700 Subject: kernel/sysctl_binary.c: check name array length in deprecated_sysctl_warning() Prevent use of uninitialized memory (originating from the stack frame of do_sysctl()) by verifying that the name array is filled with sufficient input data before comparing its specific entries with integer constants. Through timing measurement or analyzing the kernel debug logs, a user-mode program could potentially infer the results of comparisons against the uninitialized memory, and acquire some (very limited) information about the state of the kernel stack. The change also eliminates possible future warnings by tools such as KMSAN and other code checkers / instrumentations. Link: http://lkml.kernel.org/r/20170524122139.21333-1-mjurczyk@google.com Signed-off-by: Mateusz Jurczyk Acked-by: Kees Cook Cc: "David S. Miller" Cc: Matthew Whitehead Cc: "Eric W. Biederman" Cc: Tetsuo Handa Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { -- cgit v1.2.3 From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 12 Jul 2017 14:34:28 -0700 Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files With current epoll architecture target files are addressed with file_struct and file descriptor number, where the last is not unique. Moreover files can be transferred from another process via unix socket, added into queue and closed then so we won't find this descriptor in the task fdinfo list. Thus to checkpoint and restore such processes CRIU needs to find out where exactly the target file is present to add it into epoll queue. For this sake one can use kcmp call where some particular target file from the queue is compared with arbitrary file passed as an argument. Because epoll target files can have same file descriptor number but different file_struct a caller should explicitly specify the offset within. To test if some particular file is matching entry inside epoll one have to - fill kcmp_epoll_slot structure with epoll file descriptor, target file number and target file offset (in case if only one target is present then it should be 0) - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot) - the kernel fetch file pointer matching file descriptor @fd of pid1 - lookups for file struct in epoll queue of pid2 and returns traditional 0,1,2 result for sorting purpose Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com Signed-off-by: Cyrill Gorcunov Acked-by: Andrey Vagin Cc: Al Viro Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Jason Baron Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 12 Jul 2017 14:34:35 -0700 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: "Write to this file of integer N makes N-th call in the current task fail (N is 0-based). Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below" Why add a new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance I've made the current interface work with all types of our sandboxes. For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to make /proc entries non-root owned. So I am fine with the current version of the code. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d2b9d7c31eaf..ade237a96308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: -- cgit v1.2.3 From f2e0cff85ed111a3cf24d894c3fa11697dfae628 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:43 -0700 Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog() For architectures that define HAVE_NMI_WATCHDOG, instead of having them provide the complete touch_nmi_watchdog() function, just have them provide arch_touch_nmi_watchdog(). This gives the generic code more flexibility in implementing this function, and arch implementations don't miss out on touching the softlockup watchdog or other generic details. Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog_hld.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..90d688df6ce1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +66,8 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, -- cgit v1.2.3 From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:46 -0700 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. [npiggin@gmail.com: fix] Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- kernel/sysctl.c | 31 ++++--- kernel/watchdog.c | 243 ++++++++++++++++++++++++++++++++------------------ kernel/watchdog_hld.c | 32 ------- 4 files changed, 177 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { -- cgit v1.2.3 From a10a842ff81a7e3810817b3b04e4c432b6191e21 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:49 -0700 Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs After reconfiguring watchdog sysctls etc., architecture specific watchdogs may not get all their parameters updated. watchdog_nmi_reconfigure() can be implemented to pull the new values in and set the arch NMI watchdog. [npiggin@gmail.com: add code comments] Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com [arnd@arndb.de: hide unused function] Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Signed-off-by: Arnd Bergmann Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1fba9c3d66dc..cabe3e9fb620 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) { @@ -123,6 +127,22 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ +void __weak watchdog_nmi_reconfigure(void) +{ +} + + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Helper for online, unparked cpus. */ @@ -600,6 +620,14 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} +#endif + #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) { @@ -619,6 +647,13 @@ static void watchdog_disable_all_cpus(void) { } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return 0; +} +#endif + static void set_sample_period(void) { } @@ -651,6 +686,8 @@ int lockup_detector_suspend(void) watchdog_enabled = 0; } + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); return ret; @@ -671,6 +708,8 @@ void lockup_detector_resume(void) if (watchdog_running && !watchdog_suspended) watchdog_unpark_threads(); + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); } @@ -696,6 +735,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -881,12 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); -#endif } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.2.3 From e2ae8ab4b571e2e4094a28acb60649bc2732c67f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jul 2017 14:35:58 -0700 Subject: kexec_file: adjust declaration of kexec_purgatory Defining kexec_purgatory as a zero-length char array upsets compile time size checking. Since this is built on a per-arch basis, define it as an unsized char array (like is done for other similar things, e.g. linker sections). This silences the warning generated by the future CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte" array. This drops the __weak and uses an extern instead, since both users define kexec_purgatory. Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: "Eric W. Biederman" Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 7 ------- kernel/kexec_internal.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c8f7f77e9fa9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ -- cgit v1.2.3 From 7cd815bce828220deffd1654265f0ef891567774 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 12 Jul 2017 14:36:20 -0700 Subject: fork,random: use get_random_canary() to set tsk->stack_canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ade237a96308..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* -- cgit v1.2.3 From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 13 Jul 2017 14:18:24 +0800 Subject: locking/rtmutex: Remove unnecessary priority adjustment We don't need to adjust priority before adding a new pi_waiter, the priority only needs to be updated after pi_waiter change or task priority change. Steven Rostedt pointed out: "Interesting, I did some git mining and this was added with the original entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I don't see the purpose of adjusting the task prio here. It is done before anything changes in the task." Signed-off-by: Alex Shi Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org [ Enhance the changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; -- cgit v1.2.3 From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Jul 2017 00:40:28 -0700 Subject: sched/cputime: Don't use smp_processor_id() in preemptible context Recent kernels trigger this warning: BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181 caller is debug_smp_processor_id+0x17/0x19 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1 Call Trace: dump_stack+0x82/0xb8 check_preemption_disabled() debug_smp_processor_id() vtime_delta() task_cputime() thread_group_cputime() thread_group_cputime_adjusted() wait_consider_task() do_wait() SYSC_wait4() do_syscall_64() entry_SYSCALL64_slow_path() As Frederic pointed out: | Although those sched_clock_cpu() things seem to only matter when the | sched_clock() is unstable. And that stability is a condition for nohz_full | to work anyway. So probably sched_clock() alone would be enough. This patch fixes it by replacing sched_clock_cpu() with sched_clock() to avoid calling smp_processor_id() in a preemptible context. Reported-by: Xiaolong Ye Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6e3ea4ac1bda..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime) { unsigned long long clock; - clock = sched_clock_cpu(smp_processor_id()); + clock = sched_clock(); if (clock < vtime->starttime) return 0; @@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(smp_processor_id()); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); } @@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(cpu); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit v1.2.3 From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:24:29 -0700 Subject: sched/deadline: Fix confusing comments about selection of top pi-waiter This comment in the code is incomplete, and I believe it begs a definition of dl_boosted to make sense of the condition that follows. Rewrite the comment and also rearrange the condition that follows to reflect the first condition "we have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a typo that follows. Signed-off-by: Joel Fernandes Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. -- cgit v1.2.3 From 5f92a7b0fcd627fbd06ceb1cee3bbe5d08d13356 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jul 2017 14:49:46 -0700 Subject: kernel/watchdog.c: use better pr_fmt prefix After commit 73ce0511c436 ("kernel/watchdog.c: move hardlockup detector to separate file"), 'NMI watchdog' is inappropriate in kernel/watchdog.c, using 'watchdog' only. Link: http://lkml.kernel.org/r/1499928642-48983-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Babu Moger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cabe3e9fb620..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,7 +9,7 @@ * to those contributors as well. */ -#define pr_fmt(fmt) "NMI watchdog: " fmt +#define pr_fmt(fmt) "watchdog: " fmt #include #include -- cgit v1.2.3 From 6d7964a722afc8e4f880b947f174009063028c99 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 14 Jul 2017 14:50:11 -0700 Subject: kmod: throttle kmod thread limit If we reach the limit of modprobe_limit threads running the next request_module() call will fail. The original reason for adding a kill was to do away with possible issues with in old circumstances which would create a recursive series of request_module() calls. We can do better than just be super aggressive and reject calls once we've reached the limit by simply making pending callers wait until the threshold has been reduced, and then throttling them in, one by one. This throttling enables requests over the kmod concurrent limit to be processed once a pending request completes. Only the first item queued up to wait is woken up. The assumption here is once a task is woken it will have no other option to also kick the queue to check if there are more pending tasks -- regardless of whether or not it was successful. By throttling and processing only max kmod concurrent tasks we ensure we avoid unexpected fatal request_module() calls, and we keep memory consumption on module loading to a minimum. With x86_64 qemu, with 4 cores, 4 GiB of RAM it takes the following run time to run both tests: time ./kmod.sh -t 0008 real 0m16.366s user 0m0.883s sys 0m8.916s time ./kmod.sh -t 0009 real 0m50.803s user 0m0.791s sys 0m9.852s Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ff68198fe83b..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem); */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. @@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...) va_list args; char module_name[MODULE_NAME_LEN]; int ret; - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...) return ret; if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - return -ENOMEM; + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + wait_event_interruptible(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0); } trace_module_request(module_name, wait, _RET_IP_); @@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); return ret; } -- cgit v1.2.3 From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 17 Jul 2017 19:47:02 +0200 Subject: genirq/PM: Properly pretend disabled state when force resuming interrupts Interrupts with the IRQF_FORCE_RESUME flag set have also the IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond the suspension of device irqs. Force resume works by pretending that the interrupt was disabled and then calling __irq_enable(). Incrementing the disabled depth counter was enough to do that, but with the recent changes which use state flags to avoid unnecessary hardware access, this is not longer sufficient. If the state flags are not set, then the hardware callbacks are not invoked and the interrupt line stays disabled in "hardware". Set the disabled and masked state when pretending that an interrupt got disabled by suspend. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Suggested-by: Thomas Gleixner Signed-off-by: Juergen Gross Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com --- kernel/irq/chip.c | 10 ---------- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/pm.c | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d171bc57e1e0..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dbfba9933ed2..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); -- cgit v1.2.3 From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:14:16 -0700 Subject: tracing/ring_buffer: Try harder to allocate ftrace can fail to allocate per-CPU ring buffer on systems with a large number of CPUs coupled while large amounts of cache happening in the page cache. Currently the ring buffer allocation doesn't retry in the VM implementation even if direct-reclaim made some progress but still wasn't able to find a free page. On retrying I see that the allocations almost always succeed. The retry doesn't happen because __GFP_NORETRY is used in the tracer to prevent the case where we might OOM, however if we drop __GFP_NORETRY, we risk destabilizing the system if OOM killer is triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag introduced recently [1]. Tested the following still succeeds without destabilizing a system with 1GB memory. echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb [1] https://marc.info/?l=linux-mm&m=149820805124906&w=2 Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com Cc: Tim Murray Cc: Ingo Molnar Cc: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); -- cgit v1.2.3 From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 18 Jul 2017 14:37:24 +0800 Subject: audit: fix memleak in auditd_send_unicast_skb. Found this issue by kmemleak report, auditd_send_unicast_skb did not free skb if rcu_dereference(auditd_conn) returns null. unreferenced object 0xffff88082568ce00 (size 256): comm "auditd", pid 1119, jiffies 4294708499 backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_node+0xcc/0x210 [] __alloc_skb+0x5d/0x290 [] audit_make_reply+0x54/0xd0 [] audit_receive_msg+0x967/0xd70 ---------------- (gdb) list *audit_receive_msg+0x967 0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133). 1132 skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); --------------- [] audit_receive+0x52/0xa0 [] netlink_unicast+0x181/0x240 [] netlink_sendmsg+0x2c2/0x3b0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x102/0x190 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x1a/0xa5 [] 0xffffffffffffffff Signed-off-by: Shu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7cad70214b81..07def5e49cc9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } -- cgit v1.2.3 From 3bda69c1c3993a2bddbae01397d12bfef6054011 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Jul 2017 14:08:34 +0300 Subject: perf/core: Fix scheduling regression of pinned groups Vince Weaver reported: > I was tracking down some regressions in my perf_event_test testsuite. > Some of the tests broke in the 4.11-rc1 timeframe. > > I've bisected one of them, this report is about > tests/overflow/simul_oneshot_group_overflow > This test creates an event group containing two sampling events, set > to overflow to a signal handler (which disables and then refreshes the > event). > > On a good kernel you get the following: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 946 (perf::instructions/1000000) > fd 4 overflows: 473 (perf::instructions/2000000) > Ending counts: > Count 0: 946379875 > Count 1: 946365218 > > With the broken kernels you get: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 938 (perf::instructions/1000000) > fd 4 overflows: 318 (perf::instructions/2000000) > Ending counts: > Count 0: 946373080 > Count 1: 653373058 The root cause of the bug is that the following commit: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") erronously assumed that event's 'pinned' setting determines whether the event belongs to a pinned group or not, but in fact, it's the group leader's pinned state that matters. This was discovered by Vince in the test case described above, where two instruction counters are grouped, the group leader is pinned, but the other event is not; in the regressed case the counters were off by 33% (the difference between events' periods), but should be the same within the error margin. Fix the problem by looking at the group leader's pinning. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") Link: http://lkml.kernel.org/r/87lgnmvw7h.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9747e422ab20..c9cdbd396770 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) lockdep_assert_held(&ctx->lock); + /* + * It's 'group type', really, because if our group leader is + * pinned, so are we. + */ + if (event->group_leader != event) + event = event->group_leader; + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; -- cgit v1.2.3 From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 20 Jul 2017 18:36:09 +0800 Subject: tracing: Fix kmemleak in instance_rmdir Hit the kmemleak when executing instance_rmdir, it forgot releasing mem of tracing_cpumask. With this fix, the warn does not appear any more. unreferenced object 0xffff93a8dfaa7c18 (size 8): comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s) hex dump (first 8 bytes): ff ff ff ff ff ff ff ff ........ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] alloc_cpumask_var_node+0x23/0x30 [] alloc_cpumask_var+0xe/0x10 [] instance_mkdir+0x90/0x240 [] tracefs_syscall_mkdir+0x40/0x70 [] vfs_mkdir+0x109/0x1b0 [] SyS_mkdir+0xd0/0x100 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d0ffcc49dba..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit v1.2.3 From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b6401cf0..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit v1.2.3 From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. */ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit v1.2.3 From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Jul 2017 16:14:55 +0200 Subject: perf/core: Fix locking for children siblings group read We're missing ctx lock when iterating children siblings within the perf_read path for group reading. Following race and crash can happen: User space doing read syscall on event group leader: T1: perf_read lock event->ctx->mutex perf_read_group lock leader->child_mutex __perf_read_group_add(child) list_for_each_entry(sub, &leader->sibling_list, group_entry) ----> sub might be invalid at this point, because it could get removed via perf_event_exit_task_context in T2 Child exiting and cleaning up its events: T2: perf_event_exit_task_context lock ctx->mutex list_for_each_entry_safe(child_event, next, &child_ctx->event_list,... perf_event_exit_event(child) lock ctx->lock perf_group_detach(child) unlock ctx->lock ----> child is removed from sibling_list without any sync with T1 path above ... free_event(child) Before the child is removed from the leader's child_list, (and thus is omitted from perf_read_group processing), we need to ensure that perf_read_group touches child's siblings under its ctx->lock. Peter further notes: | One additional note; this bug got exposed by commit: | | ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") | | which made it possible to actually trigger this code-path. Tested-by: Andi Kleen Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd396770..c17c0881fd36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } -- cgit v1.2.3