summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched.c497
-rw-r--r--kernel/sched_fair.c124
-rw-r--r--kernel/sched_rt.c4
3 files changed, 547 insertions, 78 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 62d7481caca5..ae1a3e936d28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex);
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
+#define MIN_SHARES 2
+
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
@@ -403,6 +405,43 @@ struct cfs_rq {
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+ unsigned long task_weight;
+ unsigned long shares;
+ /*
+ * We need space to build a sched_domain wide view of the full task
+ * group tree, in order to avoid depending on dynamic memory allocation
+ * during the load balancing we place this in the per cpu task group
+ * hierarchy. This limits the load balancing to one instance per cpu,
+ * but more should not be needed anyway.
+ */
+ struct aggregate_struct {
+ /*
+ * load = weight(cpus) * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long load;
+
+ /*
+ * part of the group weight distributed to this span.
+ */
+ unsigned long shares;
+
+ /*
+ * The sum of all runqueue weights within this span.
+ */
+ unsigned long rq_weight;
+
+ /*
+ * Weight contributed by tasks; this is the part we can
+ * influence by moving tasks around.
+ */
+ unsigned long task_weight;
+ } aggregate;
+#endif
#endif
};
@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_sub(&rq->load, load);
+}
+
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ * root 1 - thread
+ * / | \ A - group
+ * A 1 B
+ * /|\ / \
+ * C 2 D 3 4
+ * | |
+ * 5 6
+ *
+ * load:
+ * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ * which equals 1/9-th of the total load.
+ *
+ * shares:
+ * The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ * B would get 2.
+ *
+ * task_weight:
+ * Part of the rq_weight contributed by tasks; all groups except B would
+ * get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+ return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+ struct sched_domain *sd)
+{
+ struct task_group *parent, *child;
+
+ rcu_read_lock();
+ parent = &root_task_group;
+down:
+ (*down)(parent, sd);
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ (*up)(parent, sd);
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+ rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long rq_weight = 0;
+ unsigned long task_weight = 0;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ rq_weight += tg->cfs_rq[i]->load.weight;
+ task_weight += tg->cfs_rq[i]->task_weight;
+ }
+
+ aggregate(tg, sd)->rq_weight = rq_weight;
+ aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Redistribute tg->shares amongst all tg->cfs_rq[]s.
+ */
+static void __aggregate_redistribute_shares(struct task_group *tg)
+{
+ int i, max_cpu = smp_processor_id();
+ unsigned long rq_weight = 0;
+ unsigned long shares, max_shares = 0, shares_rem = tg->shares;
+
+ for_each_possible_cpu(i)
+ rq_weight += tg->cfs_rq[i]->load.weight;
+
+ for_each_possible_cpu(i) {
+ /*
+ * divide shares proportional to the rq_weights.
+ */
+ shares = tg->shares * tg->cfs_rq[i]->load.weight;
+ shares /= rq_weight + 1;
+
+ tg->cfs_rq[i]->shares = shares;
+
+ if (shares > max_shares) {
+ max_shares = shares;
+ max_cpu = i;
+ }
+ shares_rem -= shares;
+ }
+
+ /*
+ * Ensure it all adds up to tg->shares; we can loose a few
+ * due to rounding down when computing the per-cpu shares.
+ */
+ if (shares_rem)
+ tg->cfs_rq[max_cpu]->shares += shares_rem;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = 0;
+ int i;
+
+again:
+ for_each_cpu_mask(i, sd->span)
+ shares += tg->cfs_rq[i]->shares;
+
+ /*
+ * When the span doesn't have any shares assigned, but does have
+ * tasks to run do a machine wide rebalance (should be rare).
+ */
+ if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
+ __aggregate_redistribute_shares(tg);
+ goto again;
+ }
+
+ aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long load;
+
+ if (!tg->parent) {
+ int i;
+
+ load = 0;
+ for_each_cpu_mask(i, sd->span)
+ load += cpu_rq(i)->load.weight;
+
+ } else {
+ load = aggregate(tg->parent, sd)->load;
+
+ /*
+ * shares is our weight in the parent's rq so
+ * shares/parent->rq_weight gives our fraction of the load
+ */
+ load *= aggregate(tg, sd)->shares;
+ load /= aggregate(tg->parent, sd)->rq_weight + 1;
+ }
+
+ aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+ int tcpu)
+{
+ int boost = 0;
+ unsigned long shares;
+ unsigned long rq_weight;
+
+ if (!tg->se[tcpu])
+ return;
+
+ rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+ /*
+ * If there are currently no tasks on the cpu pretend there is one of
+ * average load so that when a new task gets to run here it will not
+ * get delayed by group starvation.
+ */
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
+
+ /*
+ * \Sum shares * rq_weight
+ * shares = -----------------------
+ * \Sum rq_weight
+ *
+ */
+ shares = aggregate(tg, sd)->shares * rq_weight;
+ shares /= aggregate(tg, sd)->rq_weight + 1;
+
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+
+ __set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ unsigned long shares;
+
+ shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+ __update_group_shares_cpu(tg, sd, scpu);
+ __update_group_shares_cpu(tg, sd, dcpu);
+
+ /*
+ * ensure we never loose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+ if (shares)
+ tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ while (tg) {
+ __move_group_shares(tg, sd, scpu, dcpu);
+ tg = tg->parent;
+ }
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = aggregate(tg, sd)->shares;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __update_group_shares_cpu(tg, sd, i);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ aggregate_group_shares(tg, sd);
+
+ /*
+ * ensure we never loose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= aggregate(tg, sd)->shares;
+ if (shares) {
+ tg->cfs_rq[sd->first_cpu]->shares += shares;
+ aggregate(tg, sd)->shares += shares;
+ }
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_weight(tg, sd);
+ aggregate_group_shares(tg, sd);
+ aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+ if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+ return 0;
+
+ aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+ return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+ spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+ cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+ return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+
#endif /* CONFIG_SMP */
#include "sched_stats.h"
@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#define sched_class_highest (&rt_sched_class)
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
- update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
- update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
- inc_load(rq, p);
}
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
{
rq->nr_running--;
- dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(p, rq);
+ inc_nr_running(rq);
}
/*
@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(p, rq);
+ dec_nr_running(rq);
}
/**
@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(p, rq);
+ inc_nr_running(rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
+ int unlock_aggregate;
cpus_setall(*cpus);
+ unlock_aggregate = get_aggregate(sd);
+
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
@@ -3303,8 +3712,9 @@ redo:
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
- return ld_moved;
+ ld_moved = -1;
+
+ goto out;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -3319,8 +3729,13 @@ out_one_pinned:
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
- return 0;
+ ld_moved = -1;
+ else
+ ld_moved = 0;
+out:
+ if (unlock_aggregate)
+ put_aggregate(sd);
+ return ld_moved;
}
/*
@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, p, 0);
- dec_load(rq, p);
- }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
- inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
+ sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span);
+ sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
sd->span = *nodemask;
+ sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i);
+ sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i);
+ sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7633,6 +8050,7 @@ void __init sched_init(void)
}
#ifdef CONFIG_SMP
+ init_aggregate();
init_defrootdomain();
#endif
@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
int on_rq;
- spin_lock_irq(&rq->lock);
-
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
+}
- spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __set_se_shares(se, shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
}
static DEFINE_MUTEX(shares_mutex);
@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
- if (shares < 2)
- shares = 2;
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
mutex_lock(&shares_mutex);
if (tg->shares == shares)
@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i)
- set_se_shares(tg->se[i], shares);
+ for_each_possible_cpu(i) {
+ /*
+ * force a rebalance
+ */
+ cfs_rq_set_shares(tg->cfs_rq[i], 0);
+ set_se_shares(tg->se[i], shares/nr_cpu_ids);
+ }
/*
* Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+ cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se))
+ add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
}
@@ -504,6 +521,10 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se))
+ add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
}
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+ struct cfs_rq *cfs_rq)
{
- struct sched_entity *curr;
- struct task_struct *p;
-
- if (!cfs_rq->nr_running || !first_fair(cfs_rq))
- return MAX_PRIO;
-
- curr = cfs_rq->curr;
- if (!curr)
- curr = __pick_next_entity(cfs_rq);
+ struct rq_iterator cfs_rq_iterator;
- p = task_of(curr);
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+ cfs_rq_iterator.arg = cfs_rq;
- return p->prio;
+ return balance_tasks(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &cfs_rq_iterator);
}
-#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
- struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
- struct rq_iterator cfs_rq_iterator;
+ int busiest_cpu = cpu_of(busiest);
+ struct task_group *tg;
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
-
- for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct cfs_rq *this_cfs_rq;
+ rcu_read_lock();
+ list_for_each_entry(tg, &task_groups, list) {
long imbalance;
- unsigned long maxload;
+ unsigned long this_weight, busiest_weight;
+ long rem_load, max_load, moved_load;
+
+ /*
+ * empty group
+ */
+ if (!aggregate(tg, sd)->task_weight)
+ continue;
+
+ rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+ rem_load /= aggregate(tg, sd)->load + 1;
- this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+ this_weight = tg->cfs_rq[this_cpu]->task_weight;
+ busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
- imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
- /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
- if (imbalance <= 0)
+ imbalance = (busiest_weight - this_weight) / 2;
+
+ if (imbalance < 0)
+ imbalance = busiest_weight;
+
+ max_load = max(rem_load, imbalance);
+ moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+ max_load, sd, idle, all_pinned, this_best_prio,
+ tg->cfs_rq[busiest_cpu]);
+
+ if (!moved_load)
continue;
- /* Don't pull more than imbalance/2 */
- imbalance /= 2;
- maxload = min(rem_load_move, imbalance);
+ move_group_shares(tg, sd, busiest_cpu, this_cpu);
- *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
- /*
- * pass busy_cfs_rq argument into
- * load_balance_[start|next]_fair iterators
- */
- cfs_rq_iterator.arg = busy_cfs_rq;
- rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
- maxload, sd, idle, all_pinned,
- this_best_prio,
- &cfs_rq_iterator);
+ moved_load *= aggregate(tg, sd)->load;
+ moved_load /= aggregate(tg, sd)->rq_weight + 1;
- if (rem_load_move <= 0)
+ rem_load_move -= moved_load;
+ if (rem_load_move < 0)
break;
}
+ rcu_read_unlock();
return max_load_move - rem_load_move;
}
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return __load_balance_fair(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &busiest->cfs);
+}
+#endif
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 201a69382a42..736fb8fd8977 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
+
+ inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
+
+ dec_cpu_load(rq, p->se.load.weight);
}
/*