summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2020-11-10 18:39:04 +0100
committerPeter Zijlstra <peterz@infradead.org>2020-11-10 18:39:04 +0100
commit12fa97c64dce2f3c2e6eed5dc618bb9046e40bf0 (patch)
treeab8c2f7728b8fa678fdbca80d5e0206edf3a51bc /kernel/sched/core.c
parentb6d37a764a5b852db63101b3f2db0e699574b903 (diff)
parentc777d847107e80df24dae87fc9cf4b4c0bf4dfed (diff)
downloadlinux-stable-12fa97c64dce2f3c2e6eed5dc618bb9046e40bf0.tar.gz
linux-stable-12fa97c64dce2f3c2e6eed5dc618bb9046e40bf0.tar.bz2
linux-stable-12fa97c64dce2f3c2e6eed5dc618bb9046e40bf0.zip
Merge branch 'sched/migrate-disable'
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c920
1 files changed, 739 insertions, 181 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6f533bb7d3b9..622f343413a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1696,6 +1696,80 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT_RT
+
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ /*
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
+ */
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
+ return;
+ }
+
+ preempt_disable();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ preempt_disable();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
+#endif
+
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1705,7 +1779,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p))
+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
return cpu_online(cpu);
return cpu_active(cpu);
@@ -1750,8 +1824,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1783,16 +1865,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1802,21 +1887,126 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
- local_irq_enable();
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
return 0;
}
@@ -1824,18 +2014,39 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = new_mask;
+ return;
+ }
+
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
struct rq *rq = task_rq(p);
bool queued, running;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * This here violates the locking rules for affinity, since we're only
+ * supposed to change these variables while holding both rq->lock and
+ * p->pi_lock.
+ *
+ * HOWEVER, it magically works, because ttwu() is the only code that
+ * accesses these variables under p->pi_lock and only does so after
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+ * before finish_task().
+ *
+ * XXX do further audits, this smells like something putrid.
+ */
+ if (flags & SCA_MIGRATE_DISABLE)
+ SCHED_WARN_ON(!p->on_cpu);
+ else
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -1851,7 +2062,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask);
+ p->sched_class->set_cpus_allowed(p, new_mask, flags);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1859,6 +2070,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
set_next_task(rq, p);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ __do_set_cpus_allowed(p, new_mask, 0);
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <resumes>
+ * migrate_enable();
+ * __set_cpus_allowed_ptr();
+ * <wakes local stopper>
+ * `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1 P2
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * set_cpus_allowed_ptr(P0, [0, 1]);
+ * <signal completion>
+ * <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ struct migration_arg arg = {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ };
+ bool complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
+ pending = p->migration_pending;
+ if (pending) {
+ refcount_inc(&pending->refs);
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+
+ if (complete)
+ goto do_complete;
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ /* Install the request */
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE) {
+
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ p->migration_flags &= ~MDF_PUSH;
+ task_rq_unlock(rq, p, rf);
+
+ pending->arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1,
+ .pending = pending,
+ };
+
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+
+ return 0;
+ }
+
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ /*
+ * Lessen races (and headaches) by delegating
+ * is_migration_disabled(p) checks to the stopper, which will
+ * run on the same CPU as said p.
+ */
+ task_rq_unlock(rq, p, rf);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+do_complete:
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
+
+ /*
+ * Block the original owner of &pending until all subsequent callers
+ * have seen the completion and decremented the refcount
+ */
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ return 0;
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1869,7 +2282,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ const struct cpumask *new_mask,
+ u32 flags)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
@@ -1880,9 +2294,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD) {
+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
- * Kernel threads are allowed on online && !active CPUs
+ * Kernel threads are allowed on online && !active CPUs.
+ *
+ * Specifically, migration_disabled() tasks must not fail the
+ * cpumask_any_and_distribute() pick below, esp. so on
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
@@ -1891,13 +2310,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (cpumask_equal(&p->cpus_mask, new_mask))
- goto out;
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, new_mask))
+ goto out;
+
+ if (WARN_ON_ONCE(p == current &&
+ is_migration_disabled(p) &&
+ !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
/*
* Picking a ~random cpu helps in cases where we are changing affinity
@@ -1910,7 +2338,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- do_set_cpus_allowed(p, new_mask);
+ __do_set_cpus_allowed(p, new_mask, flags);
if (p->flags & PF_KTHREAD) {
/*
@@ -1922,23 +2350,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
p->nr_cpus_allowed != 1);
}
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
+ return affine_move_task(rq, p, &rf, dest_cpu, flags);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
task_rq_unlock(rq, p, &rf);
@@ -1947,7 +2360,7 @@ out:
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, false);
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
@@ -1988,6 +2401,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
+
+ WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2318,6 +2733,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
+ /*
+ * XXX When called from select_task_rq() we only
+ * hold p->pi_lock and again violate locking order.
+ *
+ * More yuck to audit.
+ */
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
break;
@@ -2352,7 +2773,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
@@ -2375,6 +2796,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
+ static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
@@ -2390,6 +2812,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
+
+ /*
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+ * adjust the effective priority of a task. As a result,
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
+ * which can then trigger wakeups of the stop thread to push
+ * around the current task.
+ *
+ * The stop task itself will never be part of the PI-chain, it
+ * never blocks, therefore that ->pi_lock recursion is safe.
+ * Tell lockdep about this by placing the stop->pi_lock in its
+ * own class.
+ */
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
@@ -2406,13 +2842,25 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ const struct cpumask *new_mask,
+ u32 flags)
{
return set_cpus_allowed_ptr(p, new_mask);
}
#endif /* CONFIG_SMP */
+#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
+
+#endif
+
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3098,6 +3546,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
}
@@ -3485,6 +3934,90 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ void (*func)(struct rq *rq);
+ struct callback_head *next;
+
+ lockdep_assert_held(&rq->lock);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ struct callback_head *head = rq->balance_callback;
+
+ lockdep_assert_held(&rq->lock);
+ if (head) {
+ rq->balance_callback = NULL;
+ rq->balance_flags &= ~BALANCE_WORK;
+ }
+
+ return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+static void balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+ if (likely(!rq->balance_flags))
+ return;
+
+ if (rq->balance_flags & BALANCE_PUSH) {
+ balance_push(rq);
+ return;
+ }
+
+ __balance_callbacks(rq);
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+static inline void balance_switch(struct rq *rq)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3510,6 +4043,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ balance_switch(rq);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3651,43 +4185,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
return rq;
}
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3707,7 +4204,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
rq = finish_task_switch(prev);
- balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -4515,6 +5011,7 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
+ migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(preempt, prev, next);
@@ -4523,10 +5020,11 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
- }
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
}
void __noreturn do_task_dead(void)
@@ -4937,9 +5435,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock(&rq->lock);
+
preempt_enable();
}
#else
@@ -5213,6 +5713,7 @@ static int __sched_setscheduler(struct task_struct *p,
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
+ struct callback_head *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5451,6 +5952,7 @@ change:
/* Avoid rq from going away on us: */
preempt_disable();
+ head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);
if (pi) {
@@ -5459,7 +5961,7 @@ change:
}
/* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
+ balance_callbacks(rq, head);
preempt_enable();
return 0;
@@ -5954,7 +6456,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -6443,6 +6945,7 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
+ print_stop_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -6533,7 +7036,7 @@ void init_idle(struct task_struct *idle, int cpu)
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu));
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -6684,119 +7187,126 @@ void idle_task_exit(void)
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
+static int __balance_push_cpu_stop(void *arg)
{
- long delta = calc_load_fold_active(rq, 1);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-}
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
- const struct sched_class *class;
- struct task_struct *next;
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
- for_each_class(class) {
- next = class->pick_next_task(rq);
- if (next) {
- next->sched_class->put_prev_task(rq, next);
- return next;
- }
+ update_rq_clock(rq);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
}
- /* The idle class should always have a runnable task */
- BUG();
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
*/
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void balance_push(struct rq *rq)
{
- struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- struct rq_flags orf = *rf;
- int dest_cpu;
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
- * Fudge the rq selection such that the below task selection loop
- * doesn't get stuck on the currently eligible stop task.
- *
- * We're currently inside stop_machine() and the rq is either stuck
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
- * either way we should never end up calling schedule() until we're
- * done here.
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
*/
- rq->stop = NULL;
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+ /*
+ * If this is the idle task on the outgoing CPU try to wake
+ * up the hotplug control thread which might wait for the
+ * last task to vanish. The rcuwait_active() check is
+ * accurate here because the waiter is pinned on this CPU
+ * and can't obviously be running in parallel.
+ *
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
+ */
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
+ raw_spin_unlock(&rq->lock);
+ rcuwait_wake_up(&rq->hotplug_wait);
+ raw_spin_lock(&rq->lock);
+ }
+ return;
+ }
+ get_task_struct(push_task);
/*
- * put_prev_task() and pick_next_task() sched
- * class method both need to have an up-to-date
- * value of rq->clock[_task]
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
*/
- update_rq_clock(rq);
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which is_per_cpu_kthread() and will push this task away.
+ */
+ raw_spin_lock(&rq->lock);
+}
- for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread:
- */
- if (rq->nr_running == 1)
- break;
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
- next = __pick_migrate_task(rq);
+ rq_lock_irqsave(rq, &rf);
+ if (on)
+ rq->balance_flags |= BALANCE_PUSH;
+ else
+ rq->balance_flags &= ~BALANCE_PUSH;
+ rq_unlock_irqrestore(rq, &rf);
+}
- /*
- * Rules for changing task_struct::cpus_mask are holding
- * both pi_lock and rq->lock, such that holding either
- * stabilizes the mask.
- *
- * Drop rq->lock is not quite as disastrous as it usually is
- * because !cpu_active at this point, which means load-balance
- * will not interfere. Also, stop-machine.
- */
- rq_unlock(rq, rf);
- raw_spin_lock(&next->pi_lock);
- rq_relock(rq, rf);
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+ struct rq *rq = this_rq();
- /*
- * Since we're inside stop-machine, _nothing_ should have
- * changed the task, WARN if weird stuff happened, because in
- * that case the above rq->lock drop is a fail too.
- */
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- raw_spin_unlock(&next->pi_lock);
- continue;
- }
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+ TASK_UNINTERRUPTIBLE);
+}
- /* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- rq = __migrate_task(rq, rf, next, dest_cpu);
- if (rq != dead_rq) {
- rq_unlock(rq, rf);
- rq = dead_rq;
- *rf = orf;
- rq_relock(rq, rf);
- }
- raw_spin_unlock(&next->pi_lock);
- }
+#else
- rq->stop = stop;
+static inline void balance_push(struct rq *rq)
+{
}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
+static inline void balance_hotplug_wait(void)
+{
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
@@ -6882,6 +7392,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+ balance_push_set(cpu, false);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
@@ -6917,6 +7429,8 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
int ret;
set_cpu_active(cpu, false);
@@ -6929,6 +7443,16 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
synchronize_rcu();
+ balance_push_set(cpu, true);
+
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ update_rq_clock(rq);
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
@@ -6942,6 +7466,7 @@ int sched_cpu_deactivate(unsigned int cpu)
ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ balance_push_set(cpu, false);
set_cpu_active(cpu, true);
return ret;
}
@@ -6965,6 +7490,41 @@ int sched_cpu_starting(unsigned int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+ balance_hotplug_wait();
+ return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq, 1);
+
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6974,12 +7534,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- migrate_tasks(rq, &rf);
- BUG_ON(rq->nr_running != 1);
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -7186,6 +7741,9 @@ void __init sched_init(void)
rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ rcuwait_init(&rq->hotplug_wait);
+#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);