From 685207963be973fbb73550db6edaf920a283e1a7 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 15 Jul 2013 17:49:19 +0400 Subject: sched: Move h_load calculation to task_h_load() The bad thing about update_h_load(), which computes hierarchical load factor for task groups, is that it is called for each task group in the system before every load balancer run, and since rebalance can be triggered very often, this function can eat really a lot of cpu time if there are many cpu cgroups in the system. Although the situation was improved significantly by commit a35b646 ('sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies'), the problem still can arise under some kinds of loads, e.g. when cpus are switching from idle to busy and back very frequently. For instance, when I start 1000 of processes that wake up every millisecond on my 8 cpus host, 'top' and 'perf top' show: Cpu(s): 17.8%us, 24.3%sy, 0.0%ni, 57.9%id, 0.0%wa, 0.0%hi, 0.0%si Events: 243K cycles 7.57% [kernel] [k] __schedule 7.08% [kernel] [k] timerqueue_add 6.13% libc-2.12.so [.] usleep Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu usage increases significantly although the 'wakers' are still executing in the root cpu cgroup: Cpu(s): 19.1%us, 48.7%sy, 0.0%ni, 31.6%id, 0.0%wa, 0.0%hi, 0.7%si Events: 230K cycles 24.56% [kernel] [k] tg_load_down 5.76% [kernel] [k] __schedule This happens because this particular kind of load triggers 'new idle' rebalance very frequently, which requires calling update_h_load(), which, in turn, calls tg_load_down() for every *idle* cpu cgroup even though it is absolutely useless, because idle cpu cgroups have no tasks to pull. This patch tries to improve the situation by making h_load calculation proceed only when h_load is really necessary. To achieve this, it substitutes update_h_load() with update_cfs_rq_h_load(), which computes h_load only for a given cfs_rq and all its ascendants, and makes the load balancer call this function whenever it considers if a task should be pulled, i.e. it moves h_load calculations directly to task_h_load(). For h_load of the same cfs_rq not to be updated multiple times (in case several tasks in the same cgroup are considered during the same balance run), the patch keeps the time of the last h_load update for each cfs_rq and breaks calculation when it finds h_load to be uptodate. The benefit of it is that h_load is computed only for those cfs_rq's, which really need it, in particular all idle task groups are skipped. Although this, in fact, moves h_load calculation under rq lock, it should not affect latency much, because the amount of work done under rq lock while trying to pull tasks is limited by sched_nr_migrate. After the patch applied with the setup described above (1000 wakers in the root cgroup and 10000 idle cgroups), I get: Cpu(s): 16.9%us, 24.8%sy, 0.0%ni, 58.4%id, 0.0%wa, 0.0%hi, 0.0%si Events: 242K cycles 7.57% [kernel] [k] __schedule 6.70% [kernel] [k] timerqueue_add 5.93% libc-2.12.so [.] usleep Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373896159-1278-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 58 ++++++++++++++++++++++++---------------------------- kernel/sched/sched.h | 7 +++---- 2 files changed, 30 insertions(+), 35 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb456f44b7b1..765d87acdf05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,47 +4171,48 @@ static void update_blocked_averages(int cpu) } /* - * Compute the cpu's hierarchical load factor for each task group. + * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static int tg_load_down(struct task_group *tg, void *data) +static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->avg.load_avg_contrib; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, - tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - struct rq *rq = cpu_rq(cpu); + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; unsigned long now = jiffies; + unsigned long load; - if (rq->h_load_throttle == now) + if (cfs_rq->last_h_load_update == now) return; - rq->h_load_throttle = now; + cfs_rq->h_load_next = NULL; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_load_next = se; + if (cfs_rq->last_h_load_update == now) + break; + } - rcu_read_lock(); - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); - rcu_read_unlock(); + if (!se) { + cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->last_h_load_update = now; + } + + while ((se = cfs_rq->h_load_next) != NULL) { + load = cfs_rq->h_load; + load = div64_ul(load * se->avg.load_avg_contrib, + cfs_rq->runnable_load_avg + 1); + cfs_rq = group_cfs_rq(se); + cfs_rq->h_load = load; + cfs_rq->last_h_load_update = now; + } } static unsigned long task_h_load(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + update_cfs_rq_h_load(cfs_rq); return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, cfs_rq->runnable_load_avg + 1); } @@ -4220,10 +4221,6 @@ static inline void update_blocked_averages(int cpu) { } -static inline void update_h_load(long cpu) -{ -} - static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg_contrib; @@ -5108,7 +5105,6 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); - update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(env.dst_rq, busiest); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..5e129efb84ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -285,7 +285,6 @@ struct cfs_rq { /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; unsigned long tg_load_contrib; -#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * h_load = weight * f(tg) @@ -294,6 +293,9 @@ struct cfs_rq { * this group. */ unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -429,9 +431,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#ifdef CONFIG_SMP - unsigned long h_load_throttle; -#endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From 62470419e993f8d9d93db0effd3af4296ecb79a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 4 Jul 2013 12:55:51 +0800 Subject: sched: Implement smarter wake-affine logic The wake-affine scheduler feature is currently always trying to pull the wakee close to the waker. In theory this should be beneficial if the waker's CPU caches hot data for the wakee, and it's also beneficial in the extreme ping-pong high context switch rate case. Testing shows it can benefit hackbench up to 15%. However, the feature is somewhat blind, from which some workloads such as pgbench suffer. It's also time-consuming algorithmically. Testing shows it can damage pgbench up to 50% - far more than the benefit it brings in the best case. So wake-affine should be smarter and it should realize when to stop its thankless effort at trying to find a suitable CPU to wake on. This patch introduces 'wakee_flips', which will be increased each time the task flips (switches) its wakee target. So a high 'wakee_flips' value means the task has more than one wakee, and the bigger the number, the higher the wakeup frequency. Now when making the decision on whether to pull or not, pay attention to the wakee with a high 'wakee_flips', pulling such a task may benefit the wakee. Also imply that the waker will face cruel competition later, it could be very cruel or very fast depends on the story behind 'wakee_flips', waker therefore suffers. Furthermore, if waker also has a high 'wakee_flips', that implies that multiple tasks rely on it, then waker's higher latency will damage all of them, so pulling wakee seems to be a bad deal. Thus, when 'waker->wakee_flips / wakee->wakee_flips' becomes higher and higher, the cost of pulling seems to be worse and worse. The patch therefore helps the wake-affine feature to stop its pulling work when: wakee->wakee_flips > factor && waker->wakee_flips > (factor * wakee->wakee_flips) The 'factor' here is the number of CPUs in the current CPU's NUMA node, so a bigger node will lead to more pulling since the trial becomes more severe. After applying the patch, pgbench shows up to 40% improvements and no regressions. Tested with 12 cpu x86 server and tip 3.10.0-rc7. The percentages in the final column highlight the areas with the biggest wins, all other areas improved as well: pgbench base smart | db_size | clients | tps | | tps | +---------+---------+-------+ +-------+ | 22 MB | 1 | 10598 | | 10796 | | 22 MB | 2 | 21257 | | 21336 | | 22 MB | 4 | 41386 | | 41622 | | 22 MB | 8 | 51253 | | 57932 | | 22 MB | 12 | 48570 | | 54000 | | 22 MB | 16 | 46748 | | 55982 | +19.75% | 22 MB | 24 | 44346 | | 55847 | +25.93% | 22 MB | 32 | 43460 | | 54614 | +25.66% | 7484 MB | 1 | 8951 | | 9193 | | 7484 MB | 2 | 19233 | | 19240 | | 7484 MB | 4 | 37239 | | 37302 | | 7484 MB | 8 | 46087 | | 50018 | | 7484 MB | 12 | 42054 | | 48763 | | 7484 MB | 16 | 40765 | | 51633 | +26.66% | 7484 MB | 24 | 37651 | | 52377 | +39.11% | 7484 MB | 32 | 37056 | | 51108 | +37.92% | 15 GB | 1 | 8845 | | 9104 | | 15 GB | 2 | 19094 | | 19162 | | 15 GB | 4 | 36979 | | 36983 | | 15 GB | 8 | 46087 | | 49977 | | 15 GB | 12 | 41901 | | 48591 | | 15 GB | 16 | 40147 | | 50651 | +26.16% | 15 GB | 24 | 37250 | | 52365 | +40.58% | 15 GB | 32 | 36470 | | 50015 | +37.14% Signed-off-by: Michael Wang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51D50057.9000809@linux.vnet.ibm.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 765d87acdf05..860063a8c849 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3017,6 +3017,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } +static void record_wakee(struct task_struct *p) +{ + /* + * Rough decay (wiping) for cost saving, don't worry + * about the boundary, really active task won't care + * about the loss. + */ + if (jiffies > current->wakee_flip_decay_ts + HZ) { + current->wakee_flips = 0; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} static void task_waking_fair(struct task_struct *p) { @@ -3037,6 +3054,7 @@ static void task_waking_fair(struct task_struct *p) #endif se->vruntime -= min_vruntime; + record_wakee(p); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3155,6 +3173,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif +static int wake_wide(struct task_struct *p) +{ + int factor = nr_cpus_node(cpu_to_node(smp_processor_id())); + + /* + * Yeah, it's the switching-frequency, could means many wakee or + * rapidly switch, use factor here will just help to automatically + * adjust the loose-degree, so bigger node will lead to more pull. + */ + if (p->wakee_flips > factor) { + /* + * wakee is somewhat hot, it needs certain amount of cpu + * resource, so if waker is far more hot, prefer to leave + * it alone. + */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; @@ -3164,6 +3204,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much. + */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); -- cgit v1.2.3 From 7d9ffa8961482232d964173cccba6e14d2d543b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jul 2013 12:56:46 +0800 Subject: sched: Micro-optimize the smart wake-affine logic Smart wake-affine is using node-size as the factor currently, but the overhead of the mask operation is high. Thus, this patch introduce the 'sd_llc_size' percpu variable, which will record the highest cache-share domain size, and make it to be the new factor, in order to reduce the overhead and make it more reasonable. Tested-by: Davidlohr Bueso Tested-by: Michael Wang Signed-off-by: Peter Zijlstra Acked-by: Michael Wang Cc: Mike Galbraith Link: http://lkml.kernel.org/r/51D5008E.6030102@linux.vnet.ibm.com [ Tidied up the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..6df0fbe53767 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5083,18 +5083,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; int id = cpu; + int size = 1; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 860063a8c849..f237437446e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3175,7 +3175,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_wide(struct task_struct *p) { - int factor = nr_cpus_node(cpu_to_node(smp_processor_id())); + int factor = this_cpu_read(sd_llc_size); /* * Yeah, it's the switching-frequency, could means many wakee or diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5e129efb84ce..4c1cb8029feb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) } DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); struct sched_group_power { -- cgit v1.2.3