From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() has an unlikely() that the policy it is checking is of RT priority (SCHED_FIFO or SCHED_RR). According to the annotate branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because the rt_policy() is used by the sched_set_scheduler() and nice(). Although users may use sys_nice a bit, all RT users use the sched_set_scheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer running firefox, evolution, xchat and was part of a distcc compile farm. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..269a0450281c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v1.2.3 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if the rq->idle_stamp is set. But since this is for a wakeup, and wakeups happen when tasks block on IO, and blocking tasks on IO may put the system into idle, this can actually be a common occurence. Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely is incorrect. Best not to assume what the result is, and just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 269a0450281c..6d24b2e8d82d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v1.2.3 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check It's really supposed to be defined here. If it's not then we actually want the build to crash so that we know it, and not keep it silent. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd18f395a1bf..01d9536aaa8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v1.2.3 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of preempt count offset independently. So that the offset can be updated by preempt_disable() and preempt_enable() even without the need for CONFIG_PREEMPT beeing set. This prepares to make CONFIG_DEBUG_SPINLOCK_SLEEP working with !CONFIG_PREEMPT where it currently doesn't detect code that sleeps inside explicit preemption disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536aaa8e..90ad7cf2b290 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -- cgit v1.2.3 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf2b290..a5f318b8d659 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); -- cgit v1.2.3 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 59252754fbe0..e355ee72e83f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v1.2.3 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(), this achieves that load_balance_fair() only iterates those task_groups that actually have tasks on busiest, and that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice, does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, loosing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad796d3b..474f341d6f91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load. - */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); -- cgit v1.2.3 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 474f341d6f91..3b3826ebe793 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v1.2.3 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. The caller, sched_create_group(), starts a clean up procedure which calls free_rt_sched_group() which unconditionally destroys the not yet initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826ebe793..f107204db53f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v1.2.3 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64 bit builds, this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cachelines. Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f107204db53f..e3f0bac05270 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v1.2.3 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i. e. remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initalization during sched_init()). In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef seems not worth it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac05270..6fdf7ffbebc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v1.2.3