Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 366
1 file changed, 214 insertions, 152 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7c535eee0a6..e9866f86f304 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5,37 +5,14 @@ * * Copyright (C) 1991-2002 Linus Torvalds */ -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <uapi/linux/sched/types.h> -#include <linux/sched/loadavg.h> -#include <linux/sched/hotplug.h> -#include <linux/wait_bit.h> -#include <linux/cpuset.h> -#include <linux/delayacct.h> -#include <linux/init_task.h> -#include <linux/context_tracking.h> -#include <linux/rcupdate_wait.h> -#include <linux/compat.h> - -#include <linux/blkdev.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/module.h> -#include <linux/nmi.h> -#include <linux/prefetch.h> -#include <linux/profile.h> -#include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/sched/isolation.h> +#include "sched.h" + +#include <linux/kthread.h> +#include <linux/nospec.h> #include <asm/switch_to.h> #include <asm/tlb.h> -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#endif -#include "sched.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -135,7 +112,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old cpu in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock, the acquire of * the old rq->lock will fully serialize against the stores. * * If we observe the new CPU in task_rq_lock, the acquire will @@ -333,7 +310,7 @@ void hrtick_start(struct rq *rq, u64 delay) } #endif /* CONFIG_SMP */ -static void init_rq_hrtick(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP rq->hrtick_csd_pending = 0; @@ -351,7 +328,7 @@ static inline void hrtick_clear(struct rq *rq) { } -static inline void init_rq_hrtick(struct rq *rq) +static inline void hrtick_rq_init(struct rq *rq) { } #endif /* CONFIG_SCHED_HRTICK */ @@ -609,7 +586,7 @@ static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) return false; if (idle_cpu(cpu) && !need_resched()) @@ -619,7 +596,7 @@ static inline bool got_nohz_idle_kick(void) * We can't run Idle Load Balance on this CPU for this time so we * cancel it and clear NOHZ_BALANCE_KICK */ - clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); return false; } @@ -900,10 +877,37 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * this case, we can save a useless back to back clock update. */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) - rq_clock_skip_update(rq, true); + rq_clock_skip_update(rq); } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !active && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -961,16 +965,8 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return rq; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return rq; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!is_cpu_allowed(p, dest_cpu)) return rq; update_rq_clock(rq); @@ -1457,7 +1453,7 @@ EXPORT_SYMBOL_GPL(kick_process); * * - cpu_active must be a subset of cpu_online * - * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, * see __set_cpus_allowed_ptr(). At this point the newly online * CPU isn't yet part of the sched domains, and balancing will not * see it. @@ -1499,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, &p->cpus_allowed) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; + goto out; } @@ -1565,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_online(cpu))) + if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; @@ -2200,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_scan_seq = 0; - } - - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? 
p->mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - p->numa_group = NULL; -#endif /* CONFIG_NUMA_BALANCING */ + init_numa_balancing(clone_flags, p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS -static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; +static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) { - static_key_slow_inc(&preempt_notifier_key); + static_branch_inc(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) { - static_key_slow_dec(&preempt_notifier_key); + static_branch_dec(&preempt_notifier_key); } EXPORT_SYMBOL_GPL(preempt_notifier_dec); @@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); */ void preempt_notifier_register(struct preempt_notifier *notifier) { - if (!static_key_false(&preempt_notifier_key)) + if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while notifiers disabled\n"); hlist_add_head(&notifier->link, &current->preempt_notifiers); @@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); } @@ -2555,7 +2529,7 @@ static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { - if (static_key_false(&preempt_notifier_key)) + if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next); } @@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) raw_spin_unlock_irq(&rq->lock); } +/* + * NOP if the arch has not defined these: + */ + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif + +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2732,20 +2718,28 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { + switch (prev_state) { + case TASK_DEAD: + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); - /* Task is done with its stack. */ - put_task_stack(prev); + put_task_struct(prev); + break; - put_task_struct(prev); + case TASK_PARKED: + kthread_park_complete(prev); + break; + } } tick_nohz_task_switch(); @@ -3037,7 +3031,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * 64-bit doesn't need locks to atomically read a 64bit value.
+ * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. * Reading ->on_cpu is racy, but this is ok. * @@ -3096,35 +3090,99 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif - rq_last_tick_reset(rq); } #ifdef CONFIG_NO_HZ_FULL -/** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. - */ -u64 scheduler_tick_max_deferment(void) + +struct tick_work { + int cpu; + struct delayed_work work; +}; + +static struct tick_work __percpu *tick_work_cpu; + +static void sched_tick_remote(struct work_struct *work) { - struct rq *rq = this_rq(); - unsigned long next, now = READ_ONCE(jiffies); + struct delayed_work *dwork = to_delayed_work(work); + struct tick_work *twork = container_of(dwork, struct tick_work, work); + int cpu = twork->cpu; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; - next = rq->last_sched_tick + HZ; + /* + * Handle the tick only if it appears the remote CPU is running in full + * dynticks mode. The check is racy by nature, but missing a tick or + * having one too much is no big deal because the scheduler tick updates + * statistics and checks timeslices in a time-independent way, regardless + * of when exactly it is running. + */ + if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { + struct task_struct *curr; + u64 delta; - if (time_before_eq(next, now)) - return 0; + rq_lock_irq(rq, &rf); + update_rq_clock(rq); + curr = rq->curr; + delta = rq_clock_task(rq) - curr->se.exec_start; - return jiffies_to_nsecs(next - now); + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + rq_unlock_irq(rq, &rf); + } + + /* + * Run the remote tick once per second (1Hz). This arbitrary + * frequency is large enough to avoid overload but short enough + * to keep scheduler internal stats reasonably up to date. 
+ */ + queue_delayed_work(system_unbound_wq, dwork, HZ); } + +static void sched_tick_start(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sched_tick_stop(int cpu) +{ + struct tick_work *twork; + + if (housekeeping_cpu(cpu, HK_FLAG_TICK)) + return; + + WARN_ON_ONCE(!tick_work_cpu); + + twork = per_cpu_ptr(tick_work_cpu, cpu); + cancel_delayed_work_sync(&twork->work); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __init sched_tick_offload_init(void) +{ + tick_work_cpu = alloc_percpu(struct tick_work); + BUG_ON(!tick_work_cpu); + + return 0; +} + +#else /* !CONFIG_NO_HZ_FULL */ +static inline void sched_tick_start(int cpu) { } +static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -3448,23 +3506,8 @@ static void __sched notrace __schedule(bool preempt) void __noreturn do_task_dead(void) { - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - raw_spin_lock_irq(&current->pi_lock); - raw_spin_unlock_irq(&current->pi_lock); - /* Causes final put_task_struct in finish_task_switch(): */ - __set_current_state(TASK_DEAD); + set_special_state(TASK_DEAD); /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; @@ -3987,6 +4030,23 @@ int idle_cpu(int cpu) } /** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * @@ -4892,7 +4952,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, * * Return: 0. */ -SYSCALL_DEFINE0(sched_yield) +static void do_sched_yield(void) { struct rq_flags rf; struct rq *rq; @@ -4913,7 +4973,11 @@ SYSCALL_DEFINE0(sched_yield) sched_preempt_enable_no_resched(); schedule(); +} +SYSCALL_DEFINE0(sched_yield) +{ + do_sched_yield(); return 0; } @@ -4958,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads.
 * @@ -4997,7 +5047,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); void __sched yield(void) { set_current_state(TASK_RUNNING); - sys_sched_yield(); + do_sched_yield(); } EXPORT_SYMBOL(yield); @@ -5506,6 +5556,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + current->active_mm = &init_mm; finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5786,6 +5837,7 @@ int sched_cpu_starting(unsigned int cpu) { set_cpu_rq_start_time(cpu); sched_rq_cpu_starting(cpu); + sched_tick_start(cpu); return 0; } @@ -5797,6 +5849,7 @@ int sched_cpu_dying(unsigned int cpu) /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); + sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); if (rq->rd) { @@ -5809,7 +5862,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(cpu); + nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } @@ -6022,13 +6075,11 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON rq->last_load_update_tick = jiffies; - rq->nohz_flags = 0; -#endif -#ifdef CONFIG_NO_HZ_FULL - rq->last_sched_tick = 0; + rq->last_blocked_load_update_tick = jiffies; + atomic_set(&rq->nohz_flags, 0); #endif #endif /* CONFIG_SMP */ - init_rq_hrtick(rq); + hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); } @@ -6683,13 +6734,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * Ensure max(child_quota) <= parent_quota, inherit when no + * Ensure max(child_quota) <= parent_quota. On cgroup2, + * always take the min. On cgroup1, only inherit when no * limit is set: */ - if (quota == RUNTIME_INF) - quota = parent_quota; - else if (parent_quota != RUNTIME_INF && quota > parent_quota) - return -EINVAL; + if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { + quota = min(quota, parent_quota); + } else { + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } } cfs_b->hierarchical_quota = quota; @@ -6868,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, s64 nice) { unsigned long weight; + int idx; if (nice < MIN_NICE || nice > MAX_NICE) return -ERANGE; - weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; + idx = array_index_nospec(idx, 40); + weight = sched_prio_to_weight[idx]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); } #endif @@ -7022,3 +7082,5 @@ const u32 sched_prio_to_wmult[40] = { /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; + +#undef CREATE_TRACE_POINTS
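
A few of the idioms introduced by this patch, sketched in isolation. The preempt-notifier hunks replace an open-coded struct static_key with the newer static-branch API (DEFINE_STATIC_KEY_FALSE(), static_branch_inc()/static_branch_dec(), static_branch_unlikely()). A minimal sketch of that idiom with a hypothetical key and hot path (not code from this patch):

#include <linux/jump_label.h>
#include <linux/printk.h>

/* Key starts disabled; the branch below is a NOP until it is enabled. */
static DEFINE_STATIC_KEY_FALSE(my_feature_key);

void my_feature_enable(void)
{
        static_branch_inc(&my_feature_key);     /* reference-counted enable */
}

void my_feature_disable(void)
{
        static_branch_dec(&my_feature_key);
}

void my_hot_path(void)
{
        /* Near-zero cost while the key is disabled. */
        if (static_branch_unlikely(&my_feature_key))
                pr_info("my_feature slow path\n");
}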
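
sched_tick_remote() implements the offloaded 1Hz tick as a delayed work item on system_unbound_wq that re-queues itself after each run. A stripped-down sketch of that self-rearming pattern, with a hypothetical payload and callback (only the workqueue calls are the real API):

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>

struct my_poll_work {
        int cpu;
        struct delayed_work work;
};

static void my_poll_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct my_poll_work *mpw = container_of(dwork, struct my_poll_work, work);

        /* ... periodic bookkeeping for mpw->cpu would go here ... */

        /* Re-arm: run again in roughly one second. */
        queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void my_poll_start(struct my_poll_work *mpw, int cpu)
{
        mpw->cpu = cpu;
        INIT_DELAYED_WORK(&mpw->work, my_poll_fn);
        queue_delayed_work(system_unbound_wq, &mpw->work, HZ);
}

Teardown mirrors sched_tick_stop(): cancel_delayed_work_sync(&mpw->work) waits for a running callback and prevents it from re-queuing itself.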
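
The tg_cfs_schedulable_down() hunk gives the two cgroup hierarchies different CFS bandwidth semantics: on cgroup1 a child quota larger than its parent's is still rejected with -EINVAL, while on the unified (cgroup2) hierarchy the effective value is clamped with min(). For example, with a parent quota of 100ms and a child configured at 150ms, cgroup1 refuses the write, whereas cgroup2 accepts it and records a hierarchical quota of 100ms for the child.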
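
Finally, the cpu_weight_nice_write_s64() hunk is a Spectre-v1 hardening: after the architectural range check, the table index derived from the user-supplied nice value is clamped under speculation with array_index_nospec() before indexing sched_prio_to_weight[]. A minimal sketch of the same pattern with a hypothetical table (array_index_nospec() itself is the real <linux/nospec.h> helper):

#include <linux/errno.h>
#include <linux/nospec.h>

#define MY_TABLE_SIZE 40

static const unsigned long my_table[MY_TABLE_SIZE] = { 0 };

/* Look up a user-controlled index without leaving a speculation gadget. */
static int my_table_lookup(int idx, unsigned long *out)
{
        /* Architectural bounds check first ... */
        if (idx < 0 || idx >= MY_TABLE_SIZE)
                return -ERANGE;

        /* ... then clamp the index for speculative execution as well. */
        idx = array_index_nospec(idx, MY_TABLE_SIZE);

        *out = my_table[idx];
        return 0;
}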