From 0680eb1f485ba5aac2ee02c9f0622239c9a4b16c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 13 Aug 2014 12:47:14 -0700 Subject: timekeeping: Another fix to the VSYSCALL_OLD update_vsyscall Benjamin Herrenschmidt pointed out that I further missed modifying update_vsyscall after the wall_to_mono value was changed to a timespec64. This causes issues on powerpc32, which expects a 32bit timespec. This patch fixes the problem by properly converting from a timespec64 to a timespec before passing the value on to the arch-specific vsyscall logic. [ Thomas is currently on vacation, but reviewed it and wanted me to send this fix on to you directly. ] Cc: LKML Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Benjamin Herrenschmidt Reported-by: Benjamin Herrenschmidt Reviewed-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f36b02838a47..fb4a9c2cf8d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); static inline void update_vsyscall(struct timekeeper *tk) { - struct timespec xt; + struct timespec xt, wm; xt = timespec64_to_timespec(tk_xtime(tk)); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, tk->tkr.cycle_last); } -- cgit v1.2.3 From b5e995e671d8e4d7a75b339ce78ecc586014b0eb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Jun 2014 16:24:41 +0530 Subject: nohz: Fix spurious periodic tick behaviour in low-res dynticks mode When we reach the end of the tick handler, we unconditionally reschedule the next tick to the next jiffy. Then on irq exit, the nohz code overrides that setting if needed and defers the next tick as far away in the future as possible. Now in the best dynticks case, when we actually don't need any tick in the future (ie: expires == KTIME_MAX), low-res and high-res behave differently. What we want in this case is to cancel the next tick programmed by the previous one. That's what we do in high-res mode. OTOH we lack a low-res mode equivalent of hrtimer_cancel() so we simply don't do anything in this case and the next tick remains scheduled to jiffies + 1. As a result, in low-res mode, when the dynticks code determines that no tick is needed in the future, we can recursively get a spurious tick every jiffy because then the next tick is always reprogrammed from the tick handler and is never cancelled. And this can happen indefinitely until some subsystem actually needs a precise tick in the future, and only then do we eventually overwrite the previous tick handler setting to defer the next tick. We are fixing this by introducing the ONESHOT_STOPPED mode which will let us pause a clockevent when no further interrupt is needed. Meanwhile we can't expect all drivers to support this new mode. So let's reduce much of the symptoms by skipping the nohz-blind tick rescheduling from the tick handler when the CPU is in dynticks mode. That tick rescheduling wrongly assumed periodicity and the low-res dynticks code can't cancel such a decision. This breaks the recursive (and thus the worst) part of the problem. In the worst case now, we'll get only one extra tick, due to an uncancelled tick scheduled before we entered dynticks mode.
This also removes a needless clockevent write on idle ticks. Since those clock writes are usually considered slow, it's a general win. Reviewed-by: Preeti U Murthy Signed-off-by: Viresh Kumar Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..153870a91350 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -968,6 +968,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(now); tick_sched_handle(ts, regs); + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); -- cgit v1.2.3 From 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Jun 2014 16:24:41 +0530 Subject: nohz: Avoid tick's double reprogramming in highres mode In highres mode, the tick reschedules itself unconditionally to the next jiffy. However while this clock reprogramming is relevant when the tick is in periodic mode, it's not that interesting when we run in dynticks mode because irq exit is likely going to overwrite the next tick to some randomly deferred future. So let's just get rid of this tick self-rescheduling in dynticks mode. This way we can avoid some clockevent double writes in favourable scenarios like when we stop the tick completely in idle while no other hrtimer is pending. Suggested-by: Frederic Weisbecker Signed-off-by: Viresh Kumar Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 153870a91350..cc0a5b6f741b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1099,6 +1099,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (regs) tick_sched_handle(ts, regs); + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; -- cgit v1.2.3 From 22127e93c587afa01e4f7225d2d1cf1d26ae7dfe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:25 -0500 Subject: time: Replace __get_cpu_var uses Convert uses of __get_cpu_var for creating an address from a percpu offset to this_cpu_ptr. The two cases where get_cpu_var is used to actually access a percpu variable are changed to use this_cpu_read/raw_cpu_read.
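For readers unfamiliar with the percpu accessors, a minimal kernel-context sketch of the conversion pattern looks like this (demo_stats and demo_counters are made-up names for illustration, not symbols from the patch):

    #include <linux/percpu.h>

    struct demo_stats {
            unsigned long events;
    };

    /* Hypothetical percpu variable, for illustration only. */
    static DEFINE_PER_CPU(struct demo_stats, demo_counters);

    static void demo_conversion(void)
    {
            /* Old style: __get_cpu_var() evaluates to the lvalue itself,
             * so callers took its address to obtain a pointer. */
            struct demo_stats *old = &__get_cpu_var(demo_counters);

            /* New style: this_cpu_ptr() takes the percpu address and
             * returns this CPU's pointer directly. */
            struct demo_stats *new = this_cpu_ptr(&demo_counters);

            old->events++;
            new->events++;
    }

Both forms yield the address of this CPU's instance; the new form simply avoids the take-the-address-of-an-lvalue idiom.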
Reviewed-by: Thomas Gleixner Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/time/hrtimer.c | 6 +++--- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 6 +++--- kernel/time/tick-oneshot.c | 2 +- kernel/time/tick-sched.c | 20 ++++++++++---------- kernel/time/timer.c | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..5f2229ba53d6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; @@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) struct hrtimer_cpu_base *cpu_base; int base = hrtimer_clockid_to_base(which_clock); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; @@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void) if (!hrtimer_hres_active()) return; - td = &__get_cpu_var(tick_cpu_device); + td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..066f0ec05e48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..decfb5f6edb0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, void tick_install_replacement(struct clock_event_device *newdev) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); @@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup) void tick_suspend(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); } void tick_resume(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int broadcast = tick_resume_broadcast(); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..7ce740e78e1b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || diff --git 
a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..73f90932282b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); */ void __tick_nohz_full_check(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_nohz_full_cpu(smp_processor_id())) { if (ts->tick_stopped && !is_idle_task(current)) { @@ -545,7 +545,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, ret = { .tv64 = 0 }; unsigned long rcu_delta_jiffies; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 time_delta; time_delta = timekeeping_max_deferment(); @@ -813,7 +813,7 @@ void tick_nohz_idle_enter(void) local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); + ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); */ void tick_nohz_irq_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) __tick_nohz_idle_enter(ts); @@ -846,7 +846,7 @@ void tick_nohz_irq_exit(void) */ ktime_t tick_nohz_get_sleep_length(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->sleep_length; } @@ -959,7 +959,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) */ static void tick_nohz_handler(struct clock_event_device *dev) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); @@ -979,7 +979,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) */ static void tick_nohz_switch_to_nohz(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) @@ -1115,7 +1115,7 @@ early_param("skew_tick", skew_tick); */ void tick_setup_sched_timer(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* @@ -1184,7 +1184,7 @@ void tick_clock_notify(void) */ void tick_oneshot_notify(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } @@ -1199,7 +1199,7 @@ void tick_oneshot_notify(void) */ int tick_check_oneshot_change(int allow_nohz) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..04d8ed8399b0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + struct tvec_base *base = raw_cpu_read(tvec_bases); timer->entry.next = NULL; timer->base = (void *)((unsigned long)base | flags); -- cgit 
v1.2.3 From dc5df73b3afffc8d042dadffc1c959008b2c1163 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:26 -0500 Subject: time: Convert a bunch of &__get_cpu_var introduced in the 3.16 merge period Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/time/hrtimer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5f2229ba53d6..a50600d87fb7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (!hrtimer_hres_active()) return; @@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) */ debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * We must preserve the CALLBACK state flag here, * otherwise we could move the timer base in @@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && hrtimer_reprogram(timer, new_base)) { /* * Only allow reprogramming if the new base is on this CPU. @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; @@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) */ void hrtimer_interrupt(struct clock_event_device *dev) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu) local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); + new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
-- cgit v1.2.3 From 4a32fea9d78f2d2315c0072757b197d5a304dc8b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:27 -0500 Subject: scheduler: Replace __get_cpu_var with this_cpu_ptr Convert all uses of __get_cpu_var for address calculation to use this_cpu_ptr instead. [Uses of __get_cpu_var with cpumask_var_t are no longer handled by this patch] Cc: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/time/tick-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73f90932282b..3cadc112519f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -924,7 +924,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -1041,7 +1041,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) -- cgit v1.2.3 From 40bea039593dfc7f3f9814dab844f6db43ae580b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Aug 2014 18:50:16 +0200 Subject: nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. A recent commit, though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9), changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_null+0x1a/0x20 [] irq_work_queue_on+0x11e/0x140 [] tick_nohz_full_kick_cpu+0x57/0x90 [] __perf_event_overflow+0x275/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] ? arch_vtime_task_switch+0x63/0x130 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? lock_release+0xab/0x330 [] default_do_nmi+0x72/0x1c0 [] ? cpuacct_account_field+0xcf/0x200 [] do_nmi+0xb8/0x100 Let's fix this by restoring the use of local irq work for the nohz local kick.
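To make the local/remote distinction concrete, here is a brief kernel-context sketch; my_kick_fn and my_kick are placeholder names, and the comments summarize the situation at the time of this patch rather than a guaranteed API contract:

    #include <linux/irq_work.h>
    #include <linux/percpu.h>

    static void my_kick_fn(struct irq_work *work)
    {
            /* Re-evaluate this CPU's tick dependency here. */
    }

    static DEFINE_PER_CPU(struct irq_work, my_kick) = {
            .func = my_kick_fn,
    };

    static void demo_kicks(int cpu)
    {
            /* Local queueing: irq_work_queue() claims the work and links it
             * on this CPU's list with atomic ops, so it is NMI-safe. */
            irq_work_queue(this_cpu_ptr(&my_kick));

            /* Remote queueing: irq_work_queue_on() has to send a cross-CPU
             * IPI, which is not NMI-safe -- hence the WARN_ON in the trace
             * above. */
            irq_work_queue_on(&per_cpu(my_kick, cpu), cpu);
    }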
Reported-by: Catalin Iacob Reported-and-tested-by: Dave Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..f654a8a298fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -224,6 +224,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, }; +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. -- cgit v1.2.3 From 9bf2419fa7bffa16ce58a4d5c20399eff8c970c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:24:49 +0200 Subject: timekeeping: Update timekeeper before updating vsyscall and pvclock The update_wall_time() code works on the shadow timekeeper to make the seqcount protected region as short as possible. But that update to the shadow timekeeper does not update all timekeeper fields because it's sufficient to do that once before it goes live. One of these fields is tkr.base_mono. That stays stale in the shadow timekeeper unless an operation happens which copies the real timekeeper to the shadow. The update function is called after the update calls to vsyscall and pvclock. While not correct, it did not cause any problems because none of the invoked update functions used base_mono. commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based) changed that in the kvm pvclock update function, so the stale base_mono value got used and caused kvm-clock to malfunction. Put the update where it belongs and fix the issue. Reported-by: Chris J Arges Reported-by: Paolo Bonzini Cc: Gleb Natapov Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409050000570.3333@nanos Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb4a9c2cf8d9..ec1791fae965 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -442,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk_update_ktime_data(tk); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); -- cgit v1.2.3 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock.
The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/time/posix-cpu-timers.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3 From d78c9300c51d6ceed9f6d078d4e9366f259de28c Mon Sep 17 00:00:00 2001 From: Andrew Hunter Date: Thu, 4 Sep 2014 14:17:16 -0700 Subject: jiffies: Fix timeval conversion to jiffies timeval_to_jiffies tried to round a timeval up to an integral number of jiffies, but the logic for doing so was incorrect: intervals corresponding to exactly N jiffies would become N+1. This manifested itself particularly when repeatedly stopping/starting an itimer:

    setitimer(ITIMER_PROF, &val, NULL);
    setitimer(ITIMER_PROF, NULL, &val);

would add a full tick to val, _even if it was exactly representable in terms of jiffies_ (say, the result of a previous rounding.) Doing this repeatedly would cause unbounded growth in val. So fix the math. Here's what was wrong with the conversion: we essentially computed (eliding seconds)

    jiffies = usec * (NSEC_PER_USEC/TICK_NSEC)

by using scaling arithmetic, which took the best approximation of NSEC_PER_USEC/TICK_NSEC with a denominator of 2^USEC_JIFFIE_SC, i.e. x/(2^USEC_JIFFIE_SC), and computed:

    jiffies = (usec * x) >> USEC_JIFFIE_SC

and rounded this calculation up in the intermediate form (since we can't necessarily exactly represent TICK_NSEC in usec.)
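The overshoot is easy to reproduce outside the kernel. The following standalone sketch assumes HZ=1000 (so SHIFT_HZ=10, SEC_JIFFIE_SC=21 and USEC_JIFFIE_SC=40) and mirrors the pre-fix conversion constants; compiled and run, it prints 11 jiffies for an input that is exactly 10:

    #include <stdio.h>
    #include <stdint.h>

    #define HZ              1000
    #define NSEC_PER_SEC    1000000000ULL
    #define NSEC_PER_USEC   1000ULL
    #define TICK_NSEC       (NSEC_PER_SEC / HZ)     /* 1000000 */
    #define SEC_JIFFIE_SC   21                      /* 31 - SHIFT_HZ */
    #define USEC_JIFFIE_SC  (SEC_JIFFIE_SC + 19)    /* 40 */
    /* Ceiling approximation of NSEC_PER_USEC/TICK_NSEC, scaled by 2^40. */
    #define USEC_CONVERSION \
            (((NSEC_PER_USEC << USEC_JIFFIE_SC) + TICK_NSEC - 1) / TICK_NSEC)
    #define USEC_ROUND      ((1ULL << USEC_JIFFIE_SC) - 1)

    int main(void)
    {
            uint64_t usec = 10000;  /* exactly 10 jiffies at HZ=1000 */
            uint64_t jiffies = ((usec * USEC_CONVERSION + USEC_ROUND)
                                >> (USEC_JIFFIE_SC - SEC_JIFFIE_SC))
                               >> SEC_JIFFIE_SC;

            /* Prints 11, not the expected 10. */
            printf("%llu usec -> %llu jiffies\n",
                   (unsigned long long)usec, (unsigned long long)jiffies);
            return 0;
    }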
But the scaling arithmetic is a (very slight) *over*approximation of the true value; that is, instead of dividing by (1 usec / 1 jiffie), we effectively divided by (1 usec/1 jiffie)-epsilon (rounding down). This would normally be fine, but we want to round timeouts up, and we did so by adding 2^USEC_JIFFIE_SC - 1 before the shift; this would be fine if our division was exact, but dividing this by the slightly smaller factor was equivalent to adding just _over_ 1 to the final result (instead of just _under_ 1, as desired.) In particular, with HZ=1000, we consistently computed that 10000 usec was 11 jiffies; the same was true for any exact multiple of TICK_NSEC. We could possibly still round in the intermediate form, adding something less than 2^USEC_JIFFIE_SC - 1, but easier still is to convert usec->nsec, round in nanoseconds, and then convert using time*spec*_to_jiffies. This adds one constant multiplication, and is not observably slower in microbenchmarks on recent x86 hardware. Tested: the following program:

    #include <stddef.h>
    #include <stdio.h>
    #include <sys/time.h>

    int main() {
        struct itimerval zero = {{0, 0}, {0, 0}};
        /* Initially set to 10 ms. */
        struct itimerval initial = zero;
        initial.it_interval.tv_usec = 10000;
        setitimer(ITIMER_PROF, &initial, NULL);
        /* Save and restore several times. */
        for (size_t i = 0; i < 10; ++i) {
            struct itimerval prev;
            setitimer(ITIMER_PROF, &zero, &prev);
            /* on old kernels, this goes up by TICK_USEC every iteration */
            printf("previous value: %ld %ld %ld %ld\n",
                   prev.it_interval.tv_sec, prev.it_interval.tv_usec,
                   prev.it_value.tv_sec, prev.it_value.tv_usec);
            setitimer(ITIMER_PROF, &prev, NULL);
        }
        return 0;
    }

Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Turner Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Paul Turner Reported-by: Aaron Jacobs Signed-off-by: Andrew Hunter [jstultz: Tweaked to apply to 3.17-rc] Signed-off-by: John Stultz --- kernel/time/time.c | 56 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/time.c b/kernel/time/time.c index f0294ba14634..a9ae20fb0b11 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value.
*/ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); -- cgit v1.2.3 From e86fea764991e00a03ff1e56409ec9cacdbda4c9 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:03 -0700 Subject: alarmtimer: Return relative times in timer_gettime Returns the time remaining for an alarm timer, rather than the time at which it is scheduled to expire. If the timer has already expired or it is not currently scheduled, the it_value's members are set to zero. This new behavior matches that of the other posix-timers and the POSIX specifications. This is a change in user-visible behavior, and may break existing applications. Hopefully, few users rely on the old incorrect behavior. 
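For illustration, the semantics being adopted look like this from userspace, sketched with a regular posix timer (CLOCK_REALTIME stands in for CLOCK_REALTIME_ALARM, which requires CAP_WAKE_ALARM; error handling omitted, link with -lrt on older glibc):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
            timer_t t;
            struct sigevent sev = { .sigev_notify = SIGEV_NONE };
            struct itimerspec its = { .it_value.tv_sec = 10 }, cur;

            timer_create(CLOCK_REALTIME, &sev, &t);
            timer_settime(t, 0, &its, NULL);   /* expires 10s from now */
            sleep(2);
            timer_gettime(t, &cur);
            /* it_value holds the remaining time (~8s), not the absolute
             * expiry; after this patch alarm timers report the same way. */
            printf("remaining: %ld.%09ld s\n",
                   (long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
            timer_delete(t);
            return 0;
    }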
Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque [jstultz: minor style tweak] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4aec4a457431..b4bce62e47b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,18 +541,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @new_timer: k_itimer pointer * @cur_setting: itimerspec data to fill * - * Copies the itimerspec data out from the k_itimer + * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - memset(cur_setting, 0, sizeof(struct itimerspec)); + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } + + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); } /** -- cgit v1.2.3 From 265b81d23a46c39df0a735a3af4238954b41a4c2 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:04 -0700 Subject: alarmtimer: Do not signal SIGEV_NONE timers Avoids sending a signal to alarm timers created with sigev_notify set to SIGEV_NONE by checking for that special case in the timeout callback. The regular posix timers avoid sending signals to SIGEV_NONE timers by not scheduling any callbacks for them in the first place. Although it would be possible to do something similar for alarm timers, it's simpler to handle this as a special case in the timeout. Prior to this patch, the alarm timer would ignore the sigev_notify value and try to deliver signals to the process anyway. Even worse, the sanity check for the value of sigev_signo is skipped when SIGEV_NONE was specified, so the signal number could be bogus. If sigev_signo was an uninitialized value (as it often would be if SIGEV_NONE is used), then it's hard to predict which signal will be sent.
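A hypothetical userspace sketch of the hazard (CLOCK_REALTIME stands in for the alarm clocks, CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM, which were the ones actually affected; regular posix timers never had this bug):

    #include <signal.h>
    #include <time.h>

    int main(void)
    {
            timer_t t;
            struct sigevent sev;            /* everything else is stack garbage */

            sev.sigev_notify = SIGEV_NONE;  /* caller asks for no notification */
            /* sev.sigev_signo is never initialized. Before this fix, an
             * expiring alarm timer ignored SIGEV_NONE and tried to deliver
             * a signal using whatever value happened to sit on the stack. */
            timer_create(CLOCK_REALTIME, &sev, &t);
            timer_delete(t);
            return 0;
    }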
Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b4bce62e47b2..41a925396830 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -466,8 +466,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } /* Re-add periodic timers */ if (ptr->it.alarm.interval.tv64) { -- cgit v1.2.3 From 474e941bed9262f5fa2394f9a4a67e24499e5926 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:05 -0700 Subject: alarmtimer: Lock k_itimer during timer callback Locks the k_itimer's it_lock member when handling the alarm timer's expiry callback. The regular posix timers defined in posix-timers.c have this lock held during timeout processing because their callbacks are routed through posix_timer_fn(). The alarm timers follow a different path, so they ought to grab the lock somewhere else. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 41a925396830..a7077d3ae52f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -464,8 +464,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { + unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; @@ -475,9 +479,11 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (ptr->it.alarm.interval.tv64) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); - return ALARMTIMER_RESTART; + result = ALARMTIMER_RESTART; } - return ALARMTIMER_NORESTART; + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; } /** -- cgit v1.2.3 From a80e49e2cc3145af014a8ae44f575829cc236192 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 17:47:18 +0200 Subject: nohz: Move nohz full init call to tick init This way we unbloat main.c a bit and, more importantly, we initialize nohz full after init_IRQ(). This dependency will be needed in further patches because nohz full needs irq work to raise its own IRQ. Information about the support for this ability on ARM64 is obtained in init_IRQ(), which initializes the pointer to __smp_call_function. Since tick_init() is called right after init_IRQ(), this is a good place to call tick_nohz_init() and prepare for that dependency. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..052b4b53c3d6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -400,4 +400,5 @@ void tick_resume(void) void __init tick_init(void) { tick_broadcast_init(); + tick_nohz_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + /* * Broadcasting support */ -- cgit v1.2.3 From 76a33061b9323b7fdb220ae5fa116c10833ec22e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 18:37:19 +0200 Subject: irq_work: Force raised irq work to run on irq work interrupt The nohz full kick, which restarts the tick when any resource depends on it, can't be executed anywhere given the operation it does on timers. If it is called from the scheduler or timers code, chances are that we run into a deadlock. This is why we run the nohz full kick from an irq work. That way we make sure that the kick runs on a virgin context. That is the case when irq work runs in its own dedicated self-IPI, but things are different for the big bunch of archs that don't support the self-triggered way. In order to support them, irq works are also handled by the timer interrupt as fallback. Now when irq works run on the timer interrupt, the context isn't blank. More precisely, they can run in the context of the hrtimer that runs the tick. But the nohz kick cancels and restarts this hrtimer and cancelling an hrtimer from itself isn't allowed. This is why we run in an endless loop: Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2 CPU: 2 PID: 7538 Comm: kworker/u8:8 Not tainted 3.16.0+ #34 Workqueue: btrfs-endio-write normal_work_helper [btrfs] ffff880244c06c88 000000001b486fe1 ffff880244c06bf0 ffffffff8a7f1e37 ffffffff8ac52a18 ffff880244c06c78 ffffffff8a7ef928 0000000000000010 ffff880244c06c88 ffff880244c06c20 000000001b486fe1 0000000000000000 Call Trace: ] dump_stack+0x4e/0x7a [] panic+0xd4/0x207 [] watchdog_overflow_callback+0x118/0x120 [] __perf_event_overflow+0xae/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? match_held_lock+0x8/0x1b0 [] default_do_nmi+0x72/0x1c0 [] do_nmi+0xb8/0x100 [] end_repeat_nmi+0x1e/0x2e [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 <] lock_acquired+0xaf/0x450 [] ? lock_hrtimer_base.isra.20+0x25/0x50 [] _raw_spin_lock_irqsave+0x78/0x90 [] ?
lock_hrtimer_base.isra.20+0x25/0x50 [] lock_hrtimer_base.isra.20+0x25/0x50 [] hrtimer_try_to_cancel+0x33/0x1e0 [] hrtimer_cancel+0x1a/0x30 [] tick_nohz_restart+0x17/0x90 [] __tick_nohz_full_check+0xc3/0x100 [] nohz_full_kick_work_func+0xe/0x10 [] irq_work_run_list+0x44/0x70 [] irq_work_run+0x2a/0x50 [] update_process_times+0x5b/0x70 [] tick_sched_handle.isra.21+0x25/0x60 [] tick_sched_timer+0x41/0x60 [] __run_hrtimer+0x72/0x470 [] ? tick_sched_do_timer+0xb0/0xb0 [] hrtimer_interrupt+0x117/0x270 [] local_apic_timer_interrupt+0x37/0x60 [] smp_apic_timer_interrupt+0x3f/0x50 [] apic_timer_interrupt+0x6f/0x80 To fix this we force non-lazy irq works to run on irq work self-IPIs when available. That ability of the arch to trigger irq work self IPIs is available with arch_irq_work_has_interrupt(). Reported-by: Catalin Iacob Reported-by: Dave Jones Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..9bbb8344ed3b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1385,7 +1385,7 @@ void update_process_times(int user_tick) rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) - irq_work_run(); + irq_work_tick(); #endif scheduler_tick(); run_posix_cpu_timers(p); -- cgit v1.2.3 From 4327b15f64b2580dad40d2674d50fc44f1b699c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Aug 2014 22:02:55 +0200 Subject: nohz: Consolidate nohz full init code The support for CONFIG_NO_HZ_FULL_ALL=y and for the nohz_full= kernel parameter each has its own way to do the same thing: allocate full dynticks cpumasks, fill them and initialize some state variables. Let's consolidate all of that in the same place. While at it, convert some regular printk messages to warnings when fundamental allocations fail. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..eb4af016ac65 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -295,22 +295,12 @@ out: /* Parse the boot-time nohz CPU list from the kernel parameters.
*/ static int __init tick_nohz_full_setup(char *str) { - int cpu; - alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } - - cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); - } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -349,18 +339,11 @@ static int tick_nohz_init_all(void) #ifdef CONFIG_NO_HZ_FULL_ALL if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; cpumask_setall(tick_nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); - cpumask_clear(housekeeping_mask); - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; @@ -375,6 +358,23 @@ void __init tick_nohz_init(void) return; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); -- cgit v1.2.3 From 9b01f5bf3999a3db5b1bbd9fdfd80d8d304e94ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 18 Aug 2014 01:36:07 +0200 Subject: nohz: nohz full depends on irq work self IPI support The nohz full functionality depends on IRQ work to trigger its own interrupts. As it's used to restart the tick, we can't rely on the tick fallback for irq work callbacks, ie: we can't use the tick to restart the tick itself. Let's reject the full dynticks initialization if that arch support isn't available. As a side effect, this makes sure that the nohz kick is never called from the tick. That otherwise would result in illegal hrtimer self-cancellation and lockup. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index eb4af016ac65..5a9ff243588c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -365,6 +365,20 @@ void __init tick_nohz_init(void) return; } + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts.
But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " + "support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + cpumask_copy(housekeeping_mask, cpu_possible_mask); + tick_nohz_full_running = false; + return; + } + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { -- cgit v1.2.3 From f139caf2e89713687514d9db847a4fa2e29c87a2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:40:54 +0400 Subject: sched, cleanup, treewide: Remove set_current_state(TASK_RUNNING) after schedule() schedule(), io_schedule() and schedule_timeout() always return with TASK_RUNNING state set, so one more setting is unnecessary. (All places in the patch are plainly correct; the only exception is kiblnd_scheduler() from drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c, whose schedule() is one line above the standard 3 lines of unified diff.) There are no places where set_current_state() is used as a memory barrier (mb()). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529254.3569.23.camel@tkhai Cc: Alasdair Kergon Cc: Anil Belur Cc: Arnd Bergmann Cc: Dave Kleikamp Cc: David Airlie Cc: David Howells Cc: Dmitry Eremin Cc: Frank Blaschka Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Helge Deller Cc: Isaac Huang Cc: James E.J. Bottomley Cc: James E.J. Bottomley Cc: J. Bruce Fields Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Laura Abbott Cc: Liang Zhen Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Masaru Nomura Cc: Michael Opdenacker Cc: Mikael Starvik Cc: Mike Snitzer Cc: Neil Brown Cc: Oleg Drokin Cc: Peng Tao Cc: Richard Weinberger Cc: Robert Love Cc: Steven Rostedt Cc: Trond Myklebust Cc: Ursula Braun Cc: Zi Shen Lim Cc: devel@driverdev.osuosl.org Cc: dm-devel@redhat.com Cc: dri-devel@lists.freedesktop.org Cc: fcoe-devel@open-fcoe.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux390@de.ibm.com Cc: linux-afs@lists.infradead.org Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-raid@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: qla2xxx-upstream@qlogic.com Cc: user-mode-linux-devel@lists.sourceforge.net Cc: user-mode-linux-user@lists.sourceforge.net Signed-off-by: Ingo Molnar --- kernel/time/hrtimer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..ab370ffffd53 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } -- cgit v1.2.3 From fe0f49768d807a8fe6336b097feb8c4441951710 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 30 Sep 2014 17:37:52 +0200 Subject: s390/nohz: use a per-cpu flag for arch_needs_cpu Move the nohz_delay bit from the s390_idle data structure to the per-cpu flags. Clear the nohz delay flag in __cpu_disable and remove the cpu hotplug notifier that used to do this.
Signed-off-by: Martin Schwidefsky --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..01d512fd45f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -572,7 +572,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || - arch_needs_cpu(cpu) || irq_work_needs_cpu()) { + arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.2.3 From 6891c4509c792209c44ced55a60f13954cb50ef4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 4 Oct 2014 23:06:39 +0200 Subject: posix-timers: Fix stack info leak in timer_create() If userland creates a timer without specifying a sigevent info, we'll create one ourselves, using a stack local variable. In particular, we will use the timer ID as sival_int. But as sigev_value is a union containing a pointer and an int, that assignment will only partially initialize sigev_value on systems where the size of a pointer is bigger than the size of an int. On such systems we'll copy the uninitialized stack bytes from the timer_create() call to userland when the timer actually fires and we're going to deliver the signal. Initialize sigev_value with 0 to plug the stack info leak. Found in the PaX patch, written by the PaX Team. Fixes: 5a9fa7307285 ("posix-timers: kill ->it_sigev_signo and...") Signed-off-by: Mathias Krause Cc: Oleg Nesterov Cc: Brad Spengler Cc: PaX Team Cc: # v2.6.28+ Link: http://lkml.kernel.org/r/1412456799-32339-1-git-send-email-minipli@googlemail.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; -- cgit v1.2.3 From 10632008b9e18b76cbff0ffc69c15e948aa548e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2014 15:07:50 +0400 Subject: clockevents: Prevent shift out of bounds Andrey reported that on a kernel with UBSan enabled he found: UBSan: Undefined behaviour in ../kernel/time/clockevents.c:75:34 I guess it should be 1ULL here instead of 1U: (!ismax || evt->mult <= (1U << evt->shift))) That's indeed the correct solution because shift might be 32. Reported-by: Andrey Ryabinin Cc: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); -- cgit v1.2.3
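For reference, the undefined behaviour that the 1ULL cast avoids can be demonstrated in a few lines of userspace C (assuming a 32-bit unsigned int):

    #include <stdio.h>

    int main(void)
    {
            unsigned int shift = 32;

            /* Undefined: the shift count equals the width of unsigned int.
             * This is what UBSan flagged when evt->shift was 32. */
            /* unsigned long long bad = 1U << shift; */

            /* Well-defined: promote to 64 bits first, as the fix does. */
            unsigned long long good = 1ULL << shift;

            printf("%llu\n", good);  /* 4294967296 */
            return 0;
    }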