From 0e7767687fdabfc58d5046e7488632bf2ecd4d0c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Apr 2018 18:58:27 +0200 Subject: time: tick-sched: Reorganize idle tick management code Prepare the scheduler tick code for reworking the idle loop to avoid stopping the tick in some cases. The idea is to split the nohz idle entry call to decouple the idle time stats accounting and preparatory work from the actual tick stop code, in order to later be able to delay the tick stop once we reach more power-knowledgeable callers. Move away the tick_nohz_start_idle() invocation from __tick_nohz_idle_enter(), rename the latter to __tick_nohz_idle_stop_tick() and define tick_nohz_idle_stop_tick() as a wrapper around it for calling it from the outside. Make tick_nohz_idle_enter() only call tick_nohz_start_idle() instead of calling the entire __tick_nohz_idle_enter(), add another wrapper disabling and enabling interrupts around tick_nohz_idle_stop_tick() and make the current callers of tick_nohz_idle_enter() call it too to retain their current functionality. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- kernel/time/tick-sched.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d4a0342f934..678349aec483 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -528,14 +528,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -894,19 +891,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); - if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; ts->idle_calls++; - expires = tick_nohz_stop_sched_tick(ts, now, cpu); + /* + * The idle entry time should be a sufficient approximation of + * the current time at this point. + */ + expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu); if (expires > 0LL) { ts->idle_sleeps++; ts->idle_expires = expires; @@ -920,16 +919,19 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. 
- * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { @@ -941,7 +943,7 @@ void tick_nohz_idle_enter(void) ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -958,10 +960,12 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle) - __tick_nohz_idle_enter(ts); - else + if (ts->inidle) { + tick_nohz_start_idle(ts); + __tick_nohz_idle_stop_tick(ts); + } else { tick_nohz_full_update_tick(ts); + } } /** -- cgit v1.2.3 From 2aaf709a518d26563b80fd7a42379d7aa7ffed4a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Mar 2018 23:05:50 +0100 Subject: sched: idle: Do not stop the tick upfront in the idle loop Push the decision whether or not to stop the tick somewhat deeper into the idle loop. Stopping the tick upfront leads to unpleasant outcomes in case the idle governor doesn't agree with the nohz code on the duration of the upcoming idle period. Specifically, if the tick has been stopped and the idle governor predicts short idle, the situation is bad regardless of whether or not the prediction is accurate. If it is accurate, the tick has been stopped unnecessarily which means excessive overhead. If it is not accurate, the CPU is likely to spend too much time in the (shallow, because short idle has been predicted) idle state selected by the governor [1]. As the first step towards addressing this problem, change the code to make the tick stopping decision inside of the loop in do_idle(). 
In particular, do not stop the tick in the cpu_idle_poll() code path. Also don't do that in tick_nohz_irq_exit() which doesn't really have enough information on whether or not to stop the tick. Link: https://marc.info/?l=linux-pm&m=150116085925208&w=2 # [1] Link: https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. Wysocki Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- kernel/time/tick-sched.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 678349aec483..f5d37788ea85 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -960,12 +960,10 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle) { + if (ts->inidle) tick_nohz_start_idle(ts); - __tick_nohz_idle_stop_tick(ts); - } else { + else tick_nohz_full_update_tick(ts); - } } /** @@ -1026,6 +1024,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1050,10 +1062,8 @@ void tick_nohz_idle_exit(void) if (ts->idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } -- cgit v1.2.3 From efefc97736e6f3261879bc9dddcb161224a455f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 20 Mar 2018 10:11:28 +0100 Subject: jiffies: Introduce USER_TICK_USEC and redefine TICK_USEC Since the subsequent changes will need a TICK_USEC definition analogous to TICK_NSEC, rename the existing TICK_USEC as USER_TICK_USEC, update its users and redefine TICK_USEC accordingly. Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; -- cgit v1.2.3 From 45f1ff59e27ca59d33cc1a317e669d90022ccf7d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Mar 2018 17:50:49 +0100 Subject: cpuidle: Return nohz hint from cpuidle_select() Add a new pointer argument to cpuidle_select() and to the ->select cpuidle governor callback to allow a boolean value indicating whether or not the tick should be stopped before entering the selected state to be returned from there. Make the ladder governor ignore that pointer (to preserve its current behavior) and make the menu governor return "false" through it if: (1) the idle exit latency is constrained at 0, or (2) the selected state is a polling one, or (3) the expected idle period duration is within the tick period range. In addition to that, the correction factor computations in the menu governor need to take the possibility that the tick may not be stopped into account to avoid artificially small correction factor values. To that end, add a mechanism to record tick wakeups, as suggested by Peter Zijlstra, and use it to modify the menu_update() behavior when tick wakeup occurs. 
Namely, if the CPU is woken up by the tick and the return value of tick_nohz_get_sleep_length() is not within the tick boundary, the predicted idle duration is likely too short, so make menu_update() try to compensate for that by updating the governor statistics as though the CPU was idle for a long time. Since the value returned through the new argument pointer of cpuidle_select() is not used by its caller yet, this change by itself is not expected to alter the functionality of the code. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/time/tick-sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f5d37788ea85..69fe113cfc7f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -966,6 +966,20 @@ void tick_nohz_irq_exit(void) tick_nohz_full_update_tick(ts); } +/** + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->inidle > 1) { + ts->inidle = 1; + return true; + } + return false; +} + /** * tick_nohz_get_sleep_length - return the length of the current sleep * @@ -1077,6 +1091,9 @@ static void tick_nohz_handler(struct clock_event_device *dev) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + if (ts->inidle) + ts->inidle = 2; + dev->next_event = KTIME_MAX; tick_sched_do_timer(now); @@ -1174,6 +1191,9 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + if (ts->inidle) + ts->inidle = 2; + tick_sched_do_timer(now); /* -- cgit v1.2.3 From 23a8d888107ce4ce444eab2dcebf4cfb3578770b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 5 Apr 2018 19:07:57 +0200 Subject: time: tick-sched: Split tick_nohz_stop_sched_tick() In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, split tick_nohz_stop_sched_tick() into two separate routines, one computing the time to the next timer event and the other simply stopping the tick when the time to the next timer event is known. Prepare these two routines to be called separately, as one of them will be called by the idle governor in the cpuidle_select() code path after subsequent changes. Update the former callers of tick_nohz_stop_sched_tick() to use the new routines, tick_nohz_next_event() and tick_nohz_stop_tick(), instead of it and move the updates of the sleep_length field in struct tick_sched into __tick_nohz_idle_stop_tick() as it doesn't need to be updated anywhere else. There should be no intentional visible changes in functionality resulting from this change. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 124 +++++++++++++++++++++++++++++------------------ kernel/time/tick-sched.h | 4 ++ 2 files changed, 82 insertions(+), 46 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69fe113cfc7f..f56d2c695712 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -641,13 +641,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -656,6 +653,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -700,47 +698,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } + /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. 
+ */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. 
*/ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -774,7 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -783,15 +797,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). 
- */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -827,7 +849,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -853,10 +875,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -893,29 +913,37 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); ktime_t expires; int cpu = smp_processor_id(); - if (can_stop_idle_tick(cpu, ts)) { + WARN_ON_ONCE(ts->timer_expires_base); + + if (!can_stop_idle_tick(cpu, ts)) + goto out; + + expires = tick_nohz_next_event(ts, cpu); + + ts->idle_calls++; + + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - /* - * The idle entry time should be a sufficient approximation of - * the current time at this point. 
- */ - expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } + +out: + ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime); } /** @@ -942,6 +970,9 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; tick_nohz_start_idle(ts); @@ -1067,6 +1098,7 @@ void tick_nohz_idle_exit(void) local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 954b43dbf21c..53e45a39bdbc 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -39,6 +39,8 @@ enum tick_nohz_mode { * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle */ struct tick_sched { @@ -60,6 +62,8 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; int do_timer_last; -- cgit v1.2.3 From a59855cd8c613ba4bb95147f6176360d95f75e60 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 3 Apr 2018 23:17:00 +0200 Subject: time: hrtimer: Introduce hrtimer_next_event_without() The next set of changes will need to compute the time to the next hrtimer event over all hrtimers except for the scheduler tick one. To that end introduce a new helper function, hrtimer_next_event_without(), for computing the time until the next hrtimer event over all timers except for one and modify the underlying code in __hrtimer_next_event_base() to prepare it for being called by that new function. No intentional changes in functionality. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- kernel/time/hrtimer.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23788100e214..6d387dbd7304 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -490,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -502,9 +503,24 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + struct rb_node *rbn = rb_next(&next->node); + + next = rb_entry_safe(rbn, struct timerqueue_node, node); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. 
*/ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -548,7 +564,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -556,7 +573,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -1202,6 +1220,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is pending. 
+ */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) -- cgit v1.2.3 From 554c8aa8ecade210d58a252173bb8f2106552a44 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 3 Apr 2018 23:17:11 +0200 Subject: sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. 
If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). The minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies Reported-by: Thomas Ilsche Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 61 ++++++++++++++++++++++++++++++++++++++---------- kernel/time/tick-sched.h | 2 -- 2 files changed, 49 insertions(+), 14 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f56d2c695712..c57c98c7e953 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -913,16 +913,19 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); ktime_t expires; int cpu = smp_processor_id(); - WARN_ON_ONCE(ts->timer_expires_base); - - if (!can_stop_idle_tick(cpu, ts)) - goto out; - - expires = tick_nohz_next_event(ts, cpu); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. + */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; ts->idle_calls++; @@ -941,9 +944,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) } else { tick_nohz_retain_tick(ts); } - -out: - ts->sleep_length = ktime_sub(dev->next_event, ts->idle_entrytime); } /** @@ -956,6 +956,16 @@ void tick_nohz_idle_stop_tick(void) __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); } +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). 
+ */ + timer_clear_idle(); +} + /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * @@ -1012,15 +1022,42 @@ bool tick_nohz_idle_got_tick(void) } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_get_sleep_length - return the expected length of the current sleep * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_sleep_length(void) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + if (!can_stop_idle_tick(cpu, ts)) + goto out_dev; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + goto out_dev; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. 
+ */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); - return ts->sleep_length; +out_dev: + return ktime_sub(dev->next_event, now); } /** diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 53e45a39bdbc..2b845f2c44b1 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,7 +38,6 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle @@ -60,7 +59,6 @@ struct tick_sched { ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; u64 timer_expires; u64 timer_expires_base; -- cgit v1.2.3 From 296bb1e51a4838a6488ec5ce676607093482ecbc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Apr 2018 19:12:34 +0200 Subject: cpuidle: menu: Refine idle state selection for running tick If the tick isn't stopped, the target residency of the state selected by the menu governor may be greater than the actual time to the next tick and that means lost energy. To avoid that, make tick_nohz_get_sleep_length() return the current time to the next event (before stopping the tick) in addition to the estimated one via an extra pointer argument and make menu_select() use that value to refine the state selection when necessary. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/time/tick-sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c57c98c7e953..edb9d49b4996 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1023,10 +1023,11 @@ bool tick_nohz_idle_got_tick(void) /** * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1040,12 +1041,14 @@ ktime_t tick_nohz_get_sleep_length(void) WARN_ON_ONCE(!ts->inidle); + *delta_next = ktime_sub(dev->next_event, now); + if (!can_stop_idle_tick(cpu, ts)) - goto out_dev; + return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) - goto out_dev; + return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the @@ -1055,9 +1058,6 @@ ktime_t tick_nohz_get_sleep_length(void) hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); - -out_dev: - return ktime_sub(dev->next_event, now); } /** -- cgit v1.2.3 From 2bc629a692a76b9ee3dab9c303e3f501bece66a4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Apr 2018 04:32:37 +0200 Subject: nohz: Gather tick_sched booleans under a common flag field Optimize the space and leave plenty of room for further flags. Signed-off-by: Frederic Weisbecker [ rjw: Do not use __this_cpu_read() to access tick_stopped and add got_idle_tick to avoid overloading inidle ] Signed-off-by: Rafael J. 
Wysocki --- kernel/time/tick-sched.c | 12 +++++++----- kernel/time/tick-sched.h | 12 ++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index edb9d49b4996..a9d5cc7406d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -465,7 +465,9 @@ __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) @@ -1014,8 +1016,8 @@ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (ts->inidle > 1) { - ts->inidle = 1; + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; return true; } return false; @@ -1161,7 +1163,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) ktime_t now = ktime_get(); if (ts->inidle) - ts->inidle = 2; + ts->got_idle_tick = 1; dev->next_event = KTIME_MAX; @@ -1261,7 +1263,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) ktime_t now = ktime_get(); if (ts->inidle) - ts->inidle = 2; + ts->got_idle_tick = 1; tick_sched_do_timer(now); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 2b845f2c44b1..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -41,19 +41,24 @@ enum tick_nohz_mode { * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + 
unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; @@ -64,7 +69,6 @@ struct tick_sched { u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; -- cgit v1.2.3 From ff7de6203131e3d60cda60aeda12c69373ca5d43 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Apr 2018 14:59:13 +0200 Subject: nohz: Avoid duplication of code related to got_idle_tick Move the code setting ts->got_idle_tick into tick_sched_do_timer() to avoid code duplication. No intentional changes in functionality. Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. Wysocki Reviewed-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a9d5cc7406d3..956831cf6cfb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -113,8 +113,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -134,6 +133,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -1162,12 +1164,9 @@ static void tick_nohz_handler(struct clock_event_device *dev) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - if (ts->inidle) - ts->got_idle_tick = 1; - dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, 
regs); /* No need to reprogram if we are running tickless */ @@ -1262,10 +1261,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - if (ts->inidle) - ts->got_idle_tick = 1; - - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have -- cgit v1.2.3 From 7d2f6abb402ae38ec4bb7beabb3980bb834b1e0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 7 Apr 2018 19:11:55 +0200 Subject: time: hrtimer: Use timerqueue_iterate_next() to get to the next timer Use timerqueue_iterate_next() to get to the next timer in __hrtimer_next_event_base() without browsing the timerqueue details directly. No intentional changes in functionality. Suggested-by: Frederic Weisbecker Signed-off-by: Rafael J. Wysocki --- kernel/time/hrtimer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6d387dbd7304..14e858753d76 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -505,9 +505,7 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. 
*/ - struct rb_node *rbn = rb_next(&next->node); - - next = rb_entry_safe(rbn, struct timerqueue_node, node); + next = timerqueue_iterate_next(next); if (!next) continue; -- cgit v1.2.3 From bbe9a70a478129f3f9b2003415d0c36afcea210f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Apr 2018 14:23:33 +0200 Subject: tick-sched: avoid a maybe-uninitialized warning The use of bitfields seems to confuse gcc, leading to a false-positive warning in all compiler versions: kernel/time/tick-sched.c: In function 'tick_nohz_idle_exit': kernel/time/tick-sched.c:538:2: error: 'now' may be used uninitialized in this function [-Werror=maybe-uninitialized] This introduces a temporary variable to track the flags so gcc doesn't have to evaluate twice, eliminating the code path that leads to the warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85301 Fixes: 1cae544d42d2 ("nohz: Gather tick_sched booleans under a common flag field") Signed-off-by: Arnd Bergmann Signed-off-by: Rafael J. Wysocki --- kernel/time/tick-sched.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 956831cf6cfb..e35a6fced00c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1134,6 +1134,7 @@ void tick_nohz_idle_restart_tick(void) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); @@ -1142,14 +1143,16 @@ void tick_nohz_idle_exit(void) WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) + if (tick_stopped) __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); -- cgit v1.2.3