summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/clock.c13
-rw-r--r--kernel/sched/completion.c31
-rw-r--r--kernel/sched/core.c271
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpudeadline.h2
-rw-r--r--kernel/sched/deadline.c87
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c9
-rw-r--r--kernel/sched/idle.c19
-rw-r--r--kernel/sched/rt.c26
-rw-r--r--kernel/sched/sched.h98
-rw-r--r--kernel/sched/stats.c11
14 files changed, 397 insertions, 206 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = -pg
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 607f852b4d04..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -268,6 +268,15 @@ bool try_wait_for_completion(struct completion *x)
unsigned long flags;
int ret = 1;
+ /*
+ * Since x->done will need to be locked only
+ * in the non-blocking case, we check x->done
+ * first without taking the lock so we can
+ * return early in the blocking case.
+ */
+ if (!READ_ONCE(x->done))
+ return 0;
+
spin_lock_irqsave(&x->wait.lock, flags);
if (!x->done)
ret = 0;
@@ -288,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- unsigned long flags;
- int ret = 1;
+ if (!READ_ONCE(x->done))
+ return false;
- spin_lock_irqsave(&x->wait.lock, flags);
- if (!x->done)
- ret = 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- return ret;
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc00566e..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update > 0)
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_skip_update & RQCF_ACT_SKIP)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -305,66 +307,6 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;
/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
@@ -490,6 +432,11 @@ static __init void init_hrtick(void)
*/
void hrtick_start(struct rq *rq, u64 delay)
{
+ /*
+ * Don't schedule slices shorter than 10000ns, that just
+ * doesn't make sense. Rely on vruntime for fairness.
+ */
+ delay = max_t(u64, delay, 10000LL);
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -1046,7 +993,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
#ifdef CONFIG_SMP
@@ -1082,7 +1029,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+ perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
}
__set_task_cpu(p, new_cpu);
@@ -1814,6 +1761,10 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
/*
@@ -1832,6 +1783,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+ p->se.avg.decay_count = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1793,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
RB_CLEAR_NODE(&p->dl.rb_node);
- hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ init_dl_task_timer(&p->dl);
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2049,6 +2003,9 @@ static inline int dl_bw_cpus(int i)
* allocated bandwidth to reflect the new situation.
*
* This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
*/
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
@@ -2748,6 +2705,10 @@ again:
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
*/
static void __sched __schedule(void)
{
@@ -2756,7 +2717,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
-need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -2776,6 +2736,8 @@ need_resched:
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2762,13 @@ need_resched:
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+ if (task_on_rq_queued(prev))
update_rq_clock(rq);
next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->skip_clock_update = 0;
+ rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -2821,8 +2783,6 @@ need_resched:
post_schedule(rq);
sched_preempt_enable_no_resched();
- if (need_resched())
- goto need_resched;
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2802,9 @@ asmlinkage __visible void __sched schedule(void)
struct task_struct *tsk = current;
sched_submit_work(tsk);
- __schedule();
+ do {
+ __schedule();
+ } while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2877,6 +2839,21 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
+static void __sched notrace preempt_schedule_common(void)
+{
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ __schedule();
+ __preempt_count_sub(PREEMPT_ACTIVE);
+
+ /*
+ * Check again in case we missed a preemption opportunity
+ * between schedule and now.
+ */
+ barrier();
+ } while (need_resched());
+}
+
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2869,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
if (likely(!preemptible()))
return;
- do {
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
- } while (need_resched());
+ preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3218,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
- init_dl_task_timer(dl_se);
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
- dl_se->dl_throttled = 0;
- dl_se->dl_new = 1;
- dl_se->dl_yielded = 0;
+
+ /*
+ * Changing the parameters of a task is 'tricky' and we're not doing
+ * the correct thing -- also see task_dead_dl() and switched_from_dl().
+ *
+ * What we SHOULD do is delay the bandwidth release until the 0-lag
+ * point. This would include retaining the task_struct until that time
+ * and change dl_overflow() to not immediately decrement the current
+ * amount.
+ *
+ * Instead we retain the current runtime/deadline and let the new
+ * parameters take effect after the current reservation period lapses.
+ * This is safe (albeit pessimistic) because the 0-lag point is always
+ * before the current scheduling deadline.
+ *
+ * We can still have temporary overloads because we do not delay the
+ * change in bandwidth until that time; so admission control is
+ * not on the safe side. It does however guarantee tasks will never
+ * consume more than promised.
+ */
}
/*
@@ -3382,6 +3365,20 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+static bool dl_param_changed(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (dl_se->dl_runtime != attr->sched_runtime ||
+ dl_se->dl_deadline != attr->sched_deadline ||
+ dl_se->dl_period != attr->sched_period ||
+ dl_se->flags != attr->sched_flags)
+ return true;
+
+ return false;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
@@ -3510,7 +3507,7 @@ recheck:
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
- if (dl_policy(policy))
+ if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4199,10 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-static void __cond_resched(void)
-{
- __preempt_count_add(PREEMPT_ACTIVE);
- __schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
int __sched _cond_resched(void)
{
if (should_resched()) {
- __cond_resched();
+ preempt_schedule_common();
return 1;
}
return 0;
@@ -4237,7 +4227,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
- __cond_resched();
+ preempt_schedule_common();
else
cpu_relax();
ret = 1;
@@ -4253,7 +4243,7 @@ int __sched __cond_resched_softirq(void)
if (should_resched()) {
local_bh_enable();
- __cond_resched();
+ preempt_schedule_common();
local_bh_disable();
return 1;
}
@@ -4368,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -4508,9 +4491,10 @@ void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned state;
+ unsigned long state = p->state;
- state = p->state ? __ffs(p->state) + 1 : 0;
+ if (state)
+ state = __ffs(state) + 1;
printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
@@ -4642,6 +4626,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
struct dl_bw *cur_dl_b;
unsigned long flags;
+ if (!cpumask_weight(cur))
+ return ret;
+
rcu_read_lock_sched();
cur_dl_b = dl_bw_of(cpumask_any(cur));
trial_cpus = cpumask_weight(trial);
@@ -4740,7 +4727,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class && p->sched_class->set_cpus_allowed)
+ if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5408,9 +5395,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
- char str[256];
- cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5423,7 +5408,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n", str, sd->name);
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5468,9 +5454,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
@@ -7250,6 +7235,11 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * During early bootup we pretend to be a normal task:
+ */
+ current->sched_class = &fair_sched_class;
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -7259,11 +7249,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
- /*
- * During early bootup we pretend to be a normal task:
- */
- current->sched_class = &fair_sched_class;
-
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
@@ -7292,13 +7277,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
* since we will exit with TASK_RUNNING make sure we enter with it,
* otherwise we will destroy state.
*/
- if (WARN_ONCE(current->state != TASK_RUNNING,
+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
"do not call blocking ops when !TASK_RUNNING; "
"state=%lx set at [<%p>] %pS\n",
current->state,
(void *)current->task_state_change,
- (void *)current->task_state_change))
- __set_current_state(TASK_RUNNING);
+ (void *)current->task_state_change);
___might_sleep(file, line, preempt_offset);
}
@@ -7325,6 +7309,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
in_atomic(), irqs_disabled(),
current->pid, current->comm);
+ if (task_stack_end_corrupted(current))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
@@ -7588,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
for_each_process_thread(g, p) {
if (rt_task(p) && task_group(p) == tg)
return 1;
@@ -7680,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7736,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
- if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+ if (later_mask &&
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
}
/*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+ cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
*/
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
if (!cp->elements)
return -ENOMEM;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
kfree(cp->elements);
return -ENOMEM;
}
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
for_each_possible_cpu(i)
cp->elements[i].idx = IDX_INVALID;
- cpumask_setall(cp->free_cpus);
-
return 0;
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b52092f2636d..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
+
+ if (dl_se->dl_yielded)
+ dl_se->dl_yielded = 0;
+ if (dl_se->dl_throttled)
+ dl_se->dl_throttled = 0;
}
/*
@@ -506,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(current, &flags);
/*
* We need to take care of several possible races here:
@@ -536,25 +535,41 @@ again:
sched_clock_tick();
update_rq_clock(rq);
- dl_se->dl_throttled = 0;
- dl_se->dl_yielded = 0;
- if (task_on_rq_queued(p)) {
- enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
+ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
- */
- if (has_pushable_dl_tasks(rq))
- push_dl_task(rq);
+ /*
+ * Queueing this task back might have overloaded rq,
+ * check if we need to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
#endif
- }
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, current, &flags);
return HRTIMER_NORESTART;
}
@@ -613,10 +628,9 @@ static void update_curr_dl(struct rq *rq)
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
+ dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
- dl_se->dl_throttled = 1;
- else
+ if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +867,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
*/
- if (p->dl.dl_throttled)
+ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
}
@@ -1073,7 +1088,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+ /*
+ * Even when we have runtime, update_curr_dl() might have resulted in us
+ * not being the leftmost task anymore. In that case NEED_RESCHED will
+ * be set and schedule() will start a new hrtick for the next task.
+ */
+ if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+ is_leftmost(p, &rq->dl))
start_hrtick_dl(rq, p);
}
@@ -1094,6 +1115,7 @@ static void task_dead_dl(struct task_struct *p)
* Since we are TASK_DEAD we won't slip out of the domain!
*/
raw_spin_lock_irq(&dl_b->lock);
+ /* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
@@ -1165,9 +1187,6 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- cpumask_copy(later_mask, task_rq(task)->rd->span);
- cpumask_and(later_mask, later_mask, cpu_active_mask);
- cpumask_and(later_mask, later_mask, &task->cpus_allowed);
best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
task, later_mask);
if (best_cpu == -1)
@@ -1562,6 +1581,7 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
+ cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
}
@@ -1573,6 +1593,7 @@ static void rq_offline_dl(struct rq *rq)
dl_clear_overload(rq);
cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+ cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
}
void init_sched_dl_class(void)
@@ -1614,8 +1635,8 @@ static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
+ /* XXX we should retain the bw until 0-lag */
cancel_dl_timer(rq, p);
-
__dl_clear_params(p);
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
PN(next_balance);
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
+ PN(clock_task);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cbf371b..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
{
u32 slice;
- p->se.avg.decay_count = 0;
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
p->se.avg.runnable_avg_sum = slice;
p->se.avg.runnable_avg_period = slice;
@@ -1730,7 +1729,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
nodes = node_online_map;
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
- nodemask_t max_group;
+ nodemask_t max_group = NODE_MASK_NONE;
int a, b;
/* Are there nodes at this distance from each other? */
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
+ se->avg.decay_count = 0;
if (!decays)
return 0;
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.decay_count = 0;
return decays;
}
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
* so we don't do microscopic update in schedule()
* and double the fastpath cost.
*/
- rq->skip_clock_update = 1;
+ rq_clock_skip_update(rq, true);
}
set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
*/
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
+ delta = __rq_clock_broken(rq) - age_stamp;
- delta = rq_clock(rq) - age_stamp;
if (unlikely(delta < 0))
delta = 0;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..94b2d7b88a27 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!tif_need_resched())
+ while (!tif_need_resched() &&
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -104,6 +106,21 @@ static void cpuidle_idle_call(void)
rcu_idle_enter();
/*
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
+ */
+ if (idle_should_freeze()) {
+ cpuidle_enter_freeze();
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ /*
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
enqueue = 1;
/*
- * Force a clock update if the CPU was idle,
- * lest wakeup -> unthrottle time accumulate.
+ * When we're idle and a woken (rt) task is
+ * throttled check_preempt_curr() will set
+ * skip_update and the time between the wakeup
+ * and this unthrottle will get accounted as
+ * 'runtime'.
*/
if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq->skip_clock_update = -1;
+ rq_clock_skip_update(rq, false);
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
- if (target != -1)
+ /*
+ * Don't bother moving it if the destination CPU is
+ * not running a lower priority task.
+ */
+ if (target != -1 &&
+ p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
lowest_rq = cpu_rq(cpu);
+ if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+ /*
+ * Target rq has tasks of equal or higher priority,
+ * retrying does not release any lock and is unlikely
+ * to yield a different result.
+ */
+ lowest_rq = NULL;
+ break;
+ }
+
/* if the prio of this runqueue changed, try again */
if (double_lock_balance(rq, lowest_rq)) {
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
- int skip_clock_update;
-
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
+ unsigned int clock_skip_update;
u64 clock;
u64 clock_task;
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return ACCESS_ONCE(rq->clock);
+}
+
static inline u64 rq_clock(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+ lockdep_assert_held(&rq->lock);
+ if (skip)
+ rq->clock_skip_update |= RQCF_REQ_SKIP;
+ else
+ rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
#ifdef CONFIG_NUMA
enum numa_topology_type {
NUMA_DIRECT,
@@ -1362,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
- char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
- if (mask_str == NULL)
- return -ENOMEM;
if (v == (void *)1) {
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
}
- kfree(mask_str);
return 0;
}