From 07b65a800b6d5b6afbd6a91487b47038eac97c21 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 22 Feb 2023 15:46:43 +0100
Subject: timers/nohz: Only ever update sleeptime from idle exit

The idle and IO sleeptime statistics appearing in /proc/stat can
currently be updated from two sites: locally on idle exit and remotely
by cpufreq. However, there is no synchronization mechanism protecting
concurrent updates. It is therefore possible to account the sleeptime
twice, among other possible broken scenarios.

To avoid breaking the sleeptime accounting, restrict the sleeptime
updates to the local idle exit site. If there is a delta to add since
the last update, IO/idle sleep time readers now only compute the delta
without writing it back to the internal idle statistic fields.

This fixes a writer VS writer race. Note there are still two known
reader VS writer races to handle. A subsequent patch will fix one.

Reported-by: Yu Liao
Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20230222144649.624380-3-frederic@kernel.org
---
 kernel/time/tick-sched.c | 95 +++++++++++++++++++-------------------------------------
 1 file changed, 37 insertions(+), 58 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..9058b9eb8bc1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -637,31 +637,21 @@ static void tick_nohz_update_jiffies(ktime_t now)
 	touch_softlockup_watchdog_sched();
 }
 
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 {
 	ktime_t delta;
 
-	if (ts->idle_active) {
-		delta = ktime_sub(now, ts->idle_entrytime);
-		if (nr_iowait_cpu(cpu) > 0)
-			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
-		else
-			ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-		ts->idle_entrytime = now;
-	}
+	if (WARN_ON_ONCE(!ts->idle_active))
+		return;
 
-	if (last_update_time)
-		*last_update_time = ktime_to_us(now);
+	delta = ktime_sub(now, ts->idle_entrytime);
 
-}
+	if (nr_iowait_cpu(smp_processor_id()) > 0)
+		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+	else
+		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
-	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+	ts->idle_entrytime = now;
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event();
@@ -674,6 +664,30 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
 	sched_clock_idle_sleep_event();
 }
 
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+				 bool compute_delta, u64 *last_update_time)
+{
+	ktime_t now, idle;
+
+	if (!tick_nohz_active)
+		return -1;
+
+	now = ktime_get();
+	if (last_update_time)
+		*last_update_time = ktime_to_us(now);
+
+	if (ts->idle_active && compute_delta) {
+		ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+		idle = ktime_add(*sleeptime, delta);
+	} else {
+		idle = *sleeptime;
+	}
+
+	return ktime_to_us(idle);
+
+}
+
 /**
  * get_cpu_idle_time_us - get the total idle time of a CPU
  * @cpu: CPU number to query
@@ -691,27 +705,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, idle;
-
-	if (!tick_nohz_active)
-		return -1;
-
-	now = ktime_get();
-	if (last_update_time) {
-		update_ts_time_stats(cpu, ts, now, last_update_time);
-		idle = ts->idle_sleeptime;
-	} else {
-		if (ts->idle_active && !nr_iowait_cpu(cpu)) {
-			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
-			idle = ktime_add(ts->idle_sleeptime, delta);
-		} else {
-			idle = ts->idle_sleeptime;
-		}
-	}
-
-	return ktime_to_us(idle);
 
+	return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+				     !nr_iowait_cpu(cpu), last_update_time);
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
@@ -732,26 +728,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now, iowait;
-
-	if (!tick_nohz_active)
-		return -1;
-
-	now = ktime_get();
-	if (last_update_time) {
-		update_ts_time_stats(cpu, ts, now, last_update_time);
-		iowait = ts->iowait_sleeptime;
-	} else {
-		if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
-			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
-			iowait = ktime_add(ts->iowait_sleeptime, delta);
-		} else {
-			iowait = ts->iowait_sleeptime;
-		}
-	}
-	return ktime_to_us(iowait);
 
+	return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+				     nr_iowait_cpu(cpu), last_update_time);
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
--
cgit v1.2.3
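
To see why unsynchronized updates could account the same sleep time twice, consider a
minimal standalone userspace sketch of the pre-patch scheme (illustrative only; the
variable names loosely mirror struct tick_sched, everything else is made up). It
hard-codes the interleaving in which a remote reader and the local idle exit both flush
the same pending interval before either one resets idle_entrytime:

/* race_sketch.c: build with "cc -o race_sketch race_sketch.c" */
#include <stdio.h>

static unsigned long long idle_sleeptime;   /* accumulated idle time */
static unsigned long long idle_entrytime;   /* start of the pending idle interval */

int main(void)
{
        unsigned long long now = 1000;      /* pretend 1000 time units passed while idle */

        /* Both the remote reader and the local idle exit sample the pending delta... */
        unsigned long long delta_remote = now - idle_entrytime;
        unsigned long long delta_local  = now - idle_entrytime;

        /* ...and both fold it into the counter before either resets idle_entrytime. */
        idle_sleeptime += delta_remote;
        idle_sleeptime += delta_local;
        idle_entrytime = now;

        printf("accounted %llu units, but only %llu units really elapsed\n",
               idle_sleeptime, now);
        return 0;
}

With the patch above applied, only the idle exit path (tick_nohz_stop_idle()) performs
the accumulation and the idle_entrytime reset, so this interleaving can no longer
happen; readers merely compute the pending delta for their own return value.
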
From 620a30fa0bd14878891b22bf2261e6ed4587c2b4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 22 Feb 2023 15:46:44 +0100
Subject: timers/nohz: Protect idle/iowait sleep time under seqcount

Reading idle/IO sleep time (e.g. from /proc/stat) can race with idle
exit updates because the state machine handling the stats is not atomic
and requires a coherent read batch. As a result, reading the sleep time
may report irrelevant or backward values.

Fix this by protecting the simple state machine with a seqcount. This is
expected to be cheap enough not to add measurable performance impact on
the idle path.

Note this only partially fixes the reader VS writer races. A race
remains that involves remote updates of the CPU iowait task counter. It
can hardly be fixed.

Reported-by: Yu Liao
Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20230222144649.624380-4-frederic@kernel.org
---
 kernel/time/tick-sched.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9058b9eb8bc1..90d9b7b29875 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -646,6 +646,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 
 	delta = ktime_sub(now, ts->idle_entrytime);
 
+	write_seqcount_begin(&ts->idle_sleeptime_seq);
 	if (nr_iowait_cpu(smp_processor_id()) > 0)
 		ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 	else
@@ -653,14 +654,18 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 
 	ts->idle_entrytime = now;
 	ts->idle_active = 0;
+	write_seqcount_end(&ts->idle_sleeptime_seq);
 
 	sched_clock_idle_wakeup_event();
 }
 
 static void tick_nohz_start_idle(struct tick_sched *ts)
 {
+	write_seqcount_begin(&ts->idle_sleeptime_seq);
 	ts->idle_entrytime = ktime_get();
 	ts->idle_active = 1;
+	write_seqcount_end(&ts->idle_sleeptime_seq);
+
 	sched_clock_idle_sleep_event();
 }
 
@@ -668,6 +673,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
 				 bool compute_delta, u64 *last_update_time)
 {
 	ktime_t now, idle;
+	unsigned int seq;
 
 	if (!tick_nohz_active)
 		return -1;
@@ -676,13 +682,17 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
 	if (last_update_time)
 		*last_update_time = ktime_to_us(now);
 
-	if (ts->idle_active && compute_delta) {
-		ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+	do {
+		seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
 
-		idle = ktime_add(*sleeptime, delta);
-	} else {
-		idle = *sleeptime;
-	}
+		if (ts->idle_active && compute_delta) {
+			ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+			idle = ktime_add(*sleeptime, delta);
+		} else {
+			idle = *sleeptime;
+		}
+	} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
 
 	return ktime_to_us(idle);
 
--
cgit v1.2.3
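
The protection added above is the classic seqcount pattern: the writer makes the
sequence counter odd around its updates, and readers retry their lock-free snapshot
whenever the counter was odd or changed under them. A compressed userspace rendition of
that control flow follows (illustrative only: the names are invented, the driver is
single-threaded, and the memory barriers that the kernel's write_seqcount_begin()/
read_seqcount_retry() primitives provide for real concurrent use are deliberately not
reproduced here):

/* seq_sketch.c: build with "cc -o seq_sketch seq_sketch.c" */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint seq;                         /* even: stable, odd: update in flight */
static unsigned long long idle_sleeptime;       /* payload guarded by the counter */
static unsigned long long idle_entrytime;
static bool idle_active;

static void idle_exit(unsigned long long now)           /* writer side */
{
        atomic_fetch_add(&seq, 1);                       /* begin: counter goes odd */
        idle_sleeptime += now - idle_entrytime;
        idle_entrytime = now;
        idle_active = false;
        atomic_fetch_add(&seq, 1);                       /* end: counter even again */
}

static unsigned long long read_sleeptime(unsigned long long now)  /* reader side */
{
        unsigned long long val;
        unsigned int s;

        do {
                while ((s = atomic_load(&seq)) & 1)
                        ;                                /* writer in flight: wait */
                val = idle_sleeptime;
                if (idle_active)                         /* pending interval not flushed yet */
                        val += now - idle_entrytime;
        } while (atomic_load(&seq) != s);                /* changed meanwhile: retry */

        return val;
}

int main(void)
{
        idle_active = true;
        idle_entrytime = 100;
        idle_exit(350);
        printf("idle time: %llu\n", read_sleeptime(400));  /* prints 250 */
        return 0;
}

The reader never blocks the idle path; at worst it loops one more time, which is why
the commit expects no measurable performance impact on idle entry/exit.
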
From ead70b75237371c735a481a9843b411cfbb18404 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 22 Feb 2023 15:46:45 +0100
Subject: timers/nohz: Add a comment about broken iowait counter update race

The per-cpu iowait task counter is incremented locally upon sleeping.
But since the task can be woken to (and by) another CPU, the counter may
then be decremented remotely. This is the source of a race involving
readers VS writer of idle/iowait sleeptime.

The following scenario shows an example where a /proc/stat reader
observes a pending sleep time as IO whereas that pending sleep time
later gets accounted as non-IO:

    CPU 0                          CPU 1                          CPU 2
    -----                          -----                          ------
    //io_schedule() TASK A
    current->in_iowait = 1
    rq(0)->nr_iowait++
    //switch to idle
                                   // READ /proc/stat
                                   // See nr_iowait_cpu(0) == 1
                                   return ts->iowait_sleeptime +
                                          ktime_sub(ktime_get(), ts->idle_entrytime)

                                                                  //try_to_wake_up(TASK A)
                                                                  rq(0)->nr_iowait--
    //idle exit
    // See nr_iowait_cpu(0) == 0
    ts->idle_sleeptime += ktime_sub(ktime_get(), ts->idle_entrytime)

As a result, subsequent reads on /proc/stat may expose backward
progress. This is unfortunately hardly fixable. Just add a comment about
that condition.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20230222144649.624380-5-frederic@kernel.org
---
 kernel/time/tick-sched.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 90d9b7b29875..edd6e9f26d16 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -705,7 +705,10 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
  * counters if NULL.
  *
  * Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
  *
  * This time is measured via accounting rather than sampling,
  * and is as accurate as ktime_get() is.
@@ -728,7 +731,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
  * counters if NULL.
  *
  * Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
  *
  * This time is measured via accounting rather than sampling,
  * and is as accurate as ktime_get() is.
--
cgit v1.2.3
From 289dafed3851a7693a47896f5d09420bf6046ef2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 22 Feb 2023 15:46:46 +0100
Subject: timers/nohz: Remove middle-function __tick_nohz_idle_stop_tick()

There is no need for the __tick_nohz_idle_stop_tick() function between
tick_nohz_idle_stop_tick() and its implementation. Remove that
unnecessary step.

Signed-off-by: Frederic Weisbecker
Signed-off-by: Thomas Gleixner
Acked-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20230222144649.624380-6-frederic@kernel.org
---
 kernel/time/tick-sched.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel/time/tick-sched.c')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index edd6e9f26d16..3b53b894ca98 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1079,10 +1079,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	return true;
 }
 
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
 {
-	ktime_t expires;
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 	int cpu = smp_processor_id();
+	ktime_t expires;
 
 	/*
 	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1114,16 +1120,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 	}
 }
 
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
-	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
 void tick_nohz_idle_retain_tick(void)
 {
 	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
--
cgit v1.2.3