diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 389 |
1 files changed, 261 insertions, 128 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0f8736991427..ff4dbbae3b10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline bool entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } @@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util, * * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. - * - * In case of capacity inversion we should honour the inverted capacity - * for both uclamp_min and uclamp_max all the time. */ - capacity_orig = cpu_in_capacity_inversion(cpu); - if (capacity_orig) { - capacity_orig_thermal = capacity_orig; - } else { - capacity_orig = capacity_orig_of(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - } + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) - fits = fits && (uclamp_min <= capacity_orig_thermal); + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + return -1; return fits; } @@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); unsigned long util = task_util_est(p); - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4656,6 +4652,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; /* * The 'current' period is already promised to the current tasks, @@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. If the entity + * slept for a long time, don't even try to compare its vruntime with + * the base as it may be too far off and the comparison may get + * inversed due to s64 overflow. + */ + sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; + if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se; s64 delta; - ideal_runtime = sched_slice(cfs_rq, curr); + /* + * When many tasks blow up the sched_period; it is possible that + * sched_slice() reports unusually large results (when many tasks are + * very light for example). Therefore impose a maximum. + */ + ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); @@ -5461,22 +5474,105 @@ unthrottle_throttle: resched_curr(rq); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. + */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); + } + + rcu_read_unlock(); + + rq_unlock(rq, &rf); +} + +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + struct cfs_rq *local_unthrottle = NULL; + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq; + struct rq_flags rf; + struct rq *rq; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ +#ifdef CONFIG_SMP + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; +#endif + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu || + SCHED_WARN_ON(local_unthrottle)) + unthrottle_cfs_rq_async(cfs_rq); + else + local_unthrottle = cfs_rq; + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); - - if (!remaining) - break; } rcu_read_unlock(); + + if (local_unthrottle) { + rq = cpu_rq(this_cpu); + rq_lock_irqsave(rq, &rf); + if (cfs_rq_throttled(local_unthrottle)) + unthrottle_cfs_rq(local_unthrottle); + rq_unlock_irqrestore(rq, &rf); + } + + return throttled; } /* @@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_SMP + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu) unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } @@ -6801,6 +6939,7 @@ static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; @@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (util_fits_cpu(task_util, util_min, util_max, cpu)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } @@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - return util_fits_cpu(util, util_min, util_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); return true; } @@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_thermal_cap = 0; + unsigned long prev_thermal_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -7236,6 +7398,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; + int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7285,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util_min = max(rq_util_min, p_util_min); util_max = max(rq_util_max, p_util_max); } - if (!util_fits_cpu(util, util_min, util_max, cpu)) + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; lsub_positive(&cpu_cap, util); @@ -7293,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap = cpu_cap; - } else if (cpu_cap > max_spare_cap) { + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7301,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; + max_fits = fits; } } @@ -7319,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; + prev_thermal_cap = cpu_thermal_cap; best_delta = min(best_delta, prev_delta); } /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; + + /* + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. + */ + if ((max_fits < 0) && + (cpu_thermal_cap <= best_thermal_cap)) + continue; + cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -= base_energy; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_thermal_cap = cpu_thermal_cap; } } rcu_read_unlock(); - if (best_delta < prev_delta) + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) target = best_energy_cpu; return target; @@ -8838,82 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - struct rq *rq = cpu_rq(cpu); - rq->cpu_capacity_orig = capacity_orig; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; - rq->cpu_capacity = capacity; - - /* - * Detect if the performance domain is in capacity inversion state. - * - * Capacity inversion happens when another perf domain with equal or - * lower capacity_orig_of() ends up having higher capacity than this - * domain after subtracting thermal pressure. - * - * We only take into account thermal pressure in this detection as it's - * the only metric that actually results in *real* reduction of - * capacity due to performance points (OPPs) being dropped/become - * unreachable due to thermal throttling. - * - * We assume: - * * That all cpus in a perf domain have the same capacity_orig - * (same uArch). - * * Thermal pressure will impact all cpus in this perf domain - * equally. - */ - if (sched_energy_enabled()) { - unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); - struct perf_domain *pd; - - rcu_read_lock(); - - pd = rcu_dereference(rq->rd->pd); - rq->cpu_capacity_inverted = 0; - - for (; pd; pd = pd->next) { - struct cpumask *pd_span = perf_domain_span(pd); - unsigned long pd_cap_orig, pd_cap; - - /* We can't be inverted against our own pd */ - if (cpumask_test_cpu(cpu_of(rq), pd_span)) - continue; - - cpu = cpumask_any(pd_span); - pd_cap_orig = arch_scale_cpu_capacity(cpu); - - if (capacity_orig < pd_cap_orig) - continue; - - /* - * handle the case of multiple perf domains have the - * same capacity_orig but one of them is under higher - * thermal pressure. We record it as capacity - * inversion. - */ - if (capacity_orig == pd_cap_orig) { - pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); - - if (pd_cap > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } else if (pd_cap_orig > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } - - rcu_read_unlock(); - } - - trace_sched_cpu_capacity_tp(rq); + cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -10141,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (sched_energy_enabled()) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -10171,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -11734,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) /* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. */ -static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11759,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) se_fi_update(se, rq->core->core_forceidle_seq, in_fi); } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) { struct rq *rq = task_rq(a); - struct sched_entity *sea = &a->se; - struct sched_entity *seb = &b->se; + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; struct cfs_rq *cfs_rqa; struct cfs_rq *cfs_rqb; s64 delta; @@ -12480,6 +12608,11 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); +#endif } open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |