summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/scheduler/sched-rt-group.rst40
-rw-r--r--include/linux/cpu.h2
-rw-r--r--include/linux/list.h8
-rw-r--r--include/linux/numa.h7
-rw-r--r--include/linux/preempt.h15
-rw-r--r--include/linux/sched.h7
-rw-r--r--include/linux/sched/deadline.h4
-rw-r--r--include/linux/sched/signal.h3
-rw-r--r--include/linux/sched/smt.h2
-rw-r--r--include/linux/sched/types.h2
-rw-r--r--include/linux/sched/vhost_task.h7
-rw-r--r--include/linux/topology.h2
-rw-r--r--include/trace/events/sched.h5
-rw-r--r--include/uapi/linux/sched/types.h4
-rw-r--r--kernel/freezer.c41
-rw-r--r--kernel/sched/build_utility.c1
-rw-r--r--kernel/sched/core.c603
-rw-r--r--kernel/sched/deadline.c67
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/fair.c130
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/rt.c85
-rw-r--r--kernel/sched/sched.h26
-rw-r--r--kernel/sched/stop_task.c4
-rw-r--r--kernel/sched/topology.c25
-rw-r--r--lib/cpumask.c4
-rw-r--r--mm/mempolicy.c18
27 files changed, 473 insertions, 650 deletions
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index 655a096ec8fb..d685609ed3d7 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -39,10 +39,10 @@ Most notable:
1.1 The problem
---------------
-Realtime scheduling is all about determinism, a group has to be able to rely on
+Real-time scheduling is all about determinism, a group has to be able to rely on
the amount of bandwidth (eg. CPU time) being constant. In order to schedule
-multiple groups of realtime tasks, each group must be assigned a fixed portion
-of the CPU time available. Without a minimum guarantee a realtime group can
+multiple groups of real-time tasks, each group must be assigned a fixed portion
+of the CPU time available. Without a minimum guarantee a real-time group can
obviously fall short. A fuzzy upper limit is of no use since it cannot be
relied upon. Which leaves us with just the single fixed portion.
@@ -50,14 +50,14 @@ relied upon. Which leaves us with just the single fixed portion.
----------------
CPU time is divided by means of specifying how much time can be spent running
-in a given period. We allocate this "run time" for each realtime group which
-the other realtime groups will not be permitted to use.
+in a given period. We allocate this "run time" for each real-time group which
+the other real-time groups will not be permitted to use.
-Any time not allocated to a realtime group will be used to run normal priority
+Any time not allocated to a real-time group will be used to run normal priority
tasks (SCHED_OTHER). Any allocated run time not used will also be picked up by
SCHED_OTHER.
-Let's consider an example: a frame fixed realtime renderer must deliver 25
+Let's consider an example: a frame fixed real-time renderer must deliver 25
frames a second, which yields a period of 0.04s per frame. Now say it will also
have to play some music and respond to input, leaving it with around 80% CPU
time dedicated for the graphics. We can then give this group a run time of 0.8
@@ -70,7 +70,7 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s =
of 0.00015s.
The remaining CPU time will be used for user input and other tasks. Because
-realtime tasks have explicitly allocated the CPU time they need to perform
+real-time tasks have explicitly allocated the CPU time they need to perform
their tasks, buffer underruns in the graphics or audio can be eliminated.
NOTE: the above example is not fully implemented yet. We still
@@ -87,18 +87,20 @@ lack an EDF scheduler to make non-uniform periods usable.
The system wide settings are configured under the /proc virtual file system:
/proc/sys/kernel/sched_rt_period_us:
- The scheduling period that is equivalent to 100% CPU bandwidth
+ The scheduling period that is equivalent to 100% CPU bandwidth.
/proc/sys/kernel/sched_rt_runtime_us:
- A global limit on how much time realtime scheduling may use. Even without
- CONFIG_RT_GROUP_SCHED enabled, this will limit time reserved to realtime
- processes. With CONFIG_RT_GROUP_SCHED it signifies the total bandwidth
- available to all realtime groups.
+ A global limit on how much time real-time scheduling may use. This is always
+ less or equal to the period_us, as it denotes the time allocated from the
+ period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
+ this will limit time reserved to real-time processes. With
+ CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
+ real-time groups.
* Time is specified in us because the interface is s32. This gives an
operating range from 1us to about 35 minutes.
* sched_rt_period_us takes values from 1 to INT_MAX.
- * sched_rt_runtime_us takes values from -1 to (INT_MAX - 1).
+ * sched_rt_runtime_us takes values from -1 to sched_rt_period_us.
* A run time of -1 specifies runtime == period, ie. no limit.
@@ -108,7 +110,7 @@ The system wide settings are configured under the /proc virtual file system:
The default values for sched_rt_period_us (1000000 or 1s) and
sched_rt_runtime_us (950000 or 0.95s). This gives 0.05s to be used by
SCHED_OTHER (non-RT tasks). These defaults were chosen so that a run-away
-realtime tasks will not lock up the machine but leave a little time to recover
+real-time tasks will not lock up the machine but leave a little time to recover
it. By setting runtime to -1 you'd get the old behaviour back.
By default all bandwidth is assigned to the root group and new groups get the
@@ -116,10 +118,10 @@ period from /proc/sys/kernel/sched_rt_period_us and a run time of 0. If you
want to assign bandwidth to another group, reduce the root group's bandwidth
and assign some or all of the difference to another group.
-Realtime group scheduling means you have to assign a portion of total CPU
-bandwidth to the group before it will accept realtime tasks. Therefore you will
-not be able to run realtime tasks as any user other than root until you have
-done that, even if the user has the rights to run processes with realtime
+Real-time group scheduling means you have to assign a portion of total CPU
+bandwidth to the group before it will accept real-time tasks. Therefore you will
+not be able to run real-time tasks as any user other than root until you have
+done that, even if the user has the rights to run processes with real-time
priority!
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 0abd60a7987b..f19f56501809 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; }
static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { }
#endif /* !CONFIG_HOTPLUG_CPU */
+DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock())
+
#ifdef CONFIG_PM_SLEEP_SMP
extern int freeze_secondary_cpus(int primary);
extern void thaw_secondary_cpus(void);
diff --git a/include/linux/list.h b/include/linux/list.h
index 164b4d0e9d2a..1837caedf723 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -687,6 +687,14 @@ static inline void list_splice_tail_init(struct list_head *list,
for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)
/**
+ * list_for_each_reverse - iterate backwards over a list
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each_reverse(pos, head) \
+ for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+/**
* list_for_each_rcu - Iterate over a list in an RCU-safe fashion
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 59df211d051f..fb30a42f0700 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -25,7 +25,7 @@
#include <asm/sparsemem.h>
/* Generic implementation available */
-int numa_map_to_online_node(int node);
+int numa_nearest_node(int node, unsigned int state);
#ifndef memory_add_physaddr_to_nid
static inline int memory_add_physaddr_to_nid(u64 start)
@@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start)
}
#endif
#else /* !CONFIG_NUMA */
-static inline int numa_map_to_online_node(int node)
+static inline int numa_nearest_node(int node, unsigned int state)
{
return NUMA_NO_NODE;
}
+
static inline int memory_add_physaddr_to_nid(u64 start)
{
return 0;
@@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start)
}
#endif
+#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE)
+
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
extern const struct attribute_group arch_node_dev_group;
#endif
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1424670df161..9aa6358a1a16 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void)
return level;
}
+/*
+ * These macro definitions avoid redundant invocations of preempt_count()
+ * because such invocations would result in redundant loads given that
+ * preempt_count() is commonly implemented with READ_ONCE().
+ */
+
#define nmi_count() (preempt_count() & NMI_MASK)
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#ifdef CONFIG_PREEMPT_RT
# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
+# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
#else
# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#endif
-#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
/*
* Macros to retrieve the current execution context:
@@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void)
#define in_nmi() (nmi_count())
#define in_hardirq() (hardirq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
-#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq()))
+#ifdef CONFIG_PREEMPT_RT
+# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
+#else
+# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+#endif
/*
* The following macros are deprecated and should not be used in new code:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 77f01ac385f7..e4235bbfad77 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,7 +63,6 @@ struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
-struct sched_param;
struct seq_file;
struct sighand_struct;
struct signal_struct;
@@ -370,6 +369,10 @@ extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif
+struct sched_param {
+ int sched_priority;
+};
+
struct sched_info {
#ifdef CONFIG_SCHED_INFO
/* Cumulative counters: */
@@ -750,10 +753,8 @@ struct task_struct {
#endif
unsigned int __state;
-#ifdef CONFIG_PREEMPT_RT
/* saved state for "spinlock sleepers" */
unsigned int saved_state;
-#endif
/*
* This begins the randomizable portion of task_struct. Only
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 7c83d4d5a971..df3aca89d4f5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_DEADLINE_H
+#define _LINUX_SCHED_DEADLINE_H
/*
* SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
#endif /* CONFIG_SMP */
+
+#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..9610bad018a3 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void);
while ((t = next_thread(t)) != g)
#define __for_each_thread(signal, t) \
- list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+ lockdep_is_held(&tasklist_lock))
#define for_each_thread(p, t) \
__for_each_thread((p)->signal, t)
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 59d3736c454c..fb1e295e7e63 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; }
void arch_smt_update(void);
-#endif
+#endif /* _LINUX_SCHED_SMT_H */
diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h
index 3c3e049224ae..969aaf5ef9d6 100644
--- a/include/linux/sched/types.h
+++ b/include/linux/sched/types.h
@@ -20,4 +20,4 @@ struct task_cputime {
unsigned long long sum_exec_runtime;
};
-#endif
+#endif /* _LINUX_SCHED_TYPES_H */
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index 837a23624a66..bc60243d43b3 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_VHOST_TASK_H
-#define _LINUX_VHOST_TASK_H
-
+#ifndef _LINUX_SCHED_VHOST_TASK_H
+#define _LINUX_SCHED_VHOST_TASK_H
struct vhost_task;
@@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk);
void vhost_task_stop(struct vhost_task *vtsk);
void vhost_task_wake(struct vhost_task *vtsk);
-#endif
+#endif /* _LINUX_SCHED_VHOST_TASK_H */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fea32377f7c7..52f5850730b3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int
#else
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
- return cpumask_nth(cpu, cpus);
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
}
static inline const struct cpumask *
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index fbb99a61f714..a13d5d06be9d 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -735,6 +735,11 @@ DECLARE_TRACE(sched_update_nr_running_tp,
TP_PROTO(struct rq *rq, int change),
TP_ARGS(rq, change));
+DECLARE_TRACE(sched_compute_energy_tp,
+ TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
+ unsigned long max_util, unsigned long busy_time),
+ TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index f2c4589d4dbf..90662385689b 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -4,10 +4,6 @@
#include <linux/types.h>
-struct sched_param {
- int sched_priority;
-};
-
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 4fad0e6fca64..c450fa8b8b5e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop)
for (;;) {
bool freeze;
+ raw_spin_lock_irq(&current->pi_lock);
set_current_state(TASK_FROZEN);
+ /* unstale saved_state so that __thaw_task() will wake us up */
+ current->saved_state = TASK_RUNNING;
+ raw_spin_unlock_irq(&current->pi_lock);
spin_lock_irq(&freezer_lock);
freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop());
@@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
WARN_ON_ONCE(debug_locks && p->lockdep_depth);
#endif
+ p->saved_state = p->__state;
WRITE_ONCE(p->__state, TASK_FROZEN);
return TASK_FROZEN;
}
@@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p)
}
/*
- * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical
- * state in p->jobctl. If either of them got a wakeup that was missed because
- * TASK_FROZEN, then their canonical state reflects that and the below will
- * refuse to restore the special state and instead issue the wakeup.
+ * Restore the saved_state before the task entered freezer. For typical task
+ * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens
+ * here. For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state
+ * is restored unless they got an expected wakeup (see ttwu_state_match()).
+ * Returns 1 if the task state was restored.
*/
-static int __set_task_special(struct task_struct *p, void *arg)
+static int __restore_freezer_state(struct task_struct *p, void *arg)
{
- unsigned int state = 0;
+ unsigned int state = p->saved_state;
- if (p->jobctl & JOBCTL_TRACED)
- state = TASK_TRACED;
-
- else if (p->jobctl & JOBCTL_STOPPED)
- state = TASK_STOPPED;
-
- if (state)
+ if (state != TASK_RUNNING) {
WRITE_ONCE(p->__state, state);
+ return 1;
+ }
- return state;
+ return 0;
}
void __thaw_task(struct task_struct *p)
{
- unsigned long flags, flags2;
+ unsigned long flags;
spin_lock_irqsave(&freezer_lock, flags);
if (WARN_ON_ONCE(freezing(p)))
goto unlock;
- if (lock_task_sighand(p, &flags2)) {
- /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */
- bool ret = task_call_func(p, __set_task_special, NULL);
- unlock_task_sighand(p, &flags2);
- if (ret)
- goto unlock;
- }
+ if (task_call_func(p, __restore_freezer_state, NULL))
+ goto unlock;
wake_up_state(p, TASK_FROZEN);
unlock:
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 99bdd96f454f..80a3df49ab47 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -34,7 +34,6 @@
#include <linux/nospec.h>
#include <linux/proc_fs.h>
#include <linux/psi.h>
-#include <linux/psi.h>
#include <linux/ptrace_api.h>
#include <linux/sched_clock.h>
#include <linux/security.h>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 802551e0009b..65e10ac34660 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -85,7 +85,6 @@
#include "sched.h"
#include "stats.h"
-#include "autogroup.h"
#include "autogroup.h"
#include "pelt.h"
@@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) val = READ_ONCE(ti->flags);
- for (;;) {
+ do {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
- break;
- }
+ } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+
return true;
}
@@ -1480,16 +1479,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p)
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
- struct rq_flags rf;
- struct rq *rq;
-
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
- rq = task_rq_lock(p, &rf);
+ guard(task_rq_lock)(p);
__uclamp_update_util_min_rt_default(p);
- task_rq_unlock(rq, p, &rf);
}
static inline struct uclamp_se
@@ -1785,9 +1780,8 @@ static void uclamp_update_root_tg(void)
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
- rcu_read_lock();
+ guard(rcu)();
cpu_util_update_eff(&root_task_group.css);
- rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
@@ -1814,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void)
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
- rcu_read_lock();
+ guard(rcu)();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
}
static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@ -2218,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ rq->curr->sched_class->wakeup_preempt(rq, p, flags);
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
@@ -2239,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state)
if (READ_ONCE(p->__state) & state)
return 1;
-#ifdef CONFIG_PREEMPT_RT
if (READ_ONCE(p->saved_state) & state)
return -1;
-#endif
+
return 0;
}
static __always_inline
int task_state_match(struct task_struct *p, unsigned int state)
{
-#ifdef CONFIG_PREEMPT_RT
- int match;
-
/*
- * Serialize against current_save_and_set_rtlock_wait_state() and
- * current_restore_rtlock_saved_state().
+ * Serialize against current_save_and_set_rtlock_wait_state(),
+ * current_restore_rtlock_saved_state(), and __refrigerator().
*/
- raw_spin_lock_irq(&p->pi_lock);
- match = __task_state_match(p, state);
- raw_spin_unlock_irq(&p->pi_lock);
-
- return match;
-#else
+ guard(raw_spinlock_irq)(&p->pi_lock);
return __task_state_match(p, state);
-#endif
}
/*
@@ -2417,10 +2400,9 @@ void migrate_disable(void)
return;
}
- preempt_disable();
+ guard(preempt)();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2444,7 +2426,7 @@ void migrate_enable(void)
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
- preempt_disable();
+ guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
/*
@@ -2455,7 +2437,6 @@ void migrate_enable(void)
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
- preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -2527,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq_lock(rq, rf);
WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
return rq;
}
@@ -3409,7 +3390,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- check_preempt_curr(dst_rq, p, 0);
+ wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
@@ -3516,13 +3497,11 @@ out:
*/
void kick_process(struct task_struct *p)
{
- int cpu;
+ guard(preempt)();
+ int cpu = task_cpu(p);
- preempt_disable();
- cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
@@ -3785,7 +3764,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
}
activate_task(rq, p, en_flags);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
ttwu_do_wakeup(p);
@@ -3856,7 +3835,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
* it should preempt the task that is current now.
*/
update_rq_clock(rq);
- check_preempt_curr(rq, p, wake_flags);
+ wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
ret = 1;
@@ -4036,13 +4015,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* The caller holds p::pi_lock if p != current or has preemption
* disabled when p == current.
*
- * The rules of PREEMPT_RT saved_state:
+ * The rules of saved_state:
*
* The related locking code always holds p::pi_lock when updating
* p::saved_state, which means the code is fully serialized in both cases.
*
- * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- * bits set. This allows to distinguish all wakeup scenarios.
+ * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ * No other bits set. This allows to distinguish all wakeup scenarios.
+ *
+ * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ * allows us to prevent early wakeup of tasks before they can be run on
+ * asymmetric ISA architectures (eg ARMv9).
*/
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@ -4056,13 +4039,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*success = !!(match = __task_state_match(p, state));
-#ifdef CONFIG_PREEMPT_RT
/*
* Saved state preserves the task state across blocking on
- * an RT lock. If the state matches, set p::saved_state to
- * TASK_RUNNING, but do not wake the task because it waits
- * for a lock wakeup. Also indicate success because from
- * the regular waker's point of view this has succeeded.
+ * an RT lock or TASK_FREEZABLE tasks. If the state matches,
+ * set p::saved_state to TASK_RUNNING, but do not wake the task
+ * because it waits for a lock wakeup or __thaw_task(). Also
+ * indicate success because from the regular waker's point of
+ * view this has succeeded.
*
* After acquiring the lock the task will restore p::__state
* from p::saved_state which ensures that the regular
@@ -4072,7 +4055,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
*/
if (match < 0)
p->saved_state = TASK_RUNNING;
-#endif
+
return match > 0;
}
@@ -4871,7 +4854,7 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
- check_preempt_curr(rq, p, WF_FORK);
+ wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -5916,8 +5899,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- && in_atomic_preempt_off()) {
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
@@ -6368,8 +6350,9 @@ static void sched_core_balance(struct rq *rq)
struct sched_domain *sd;
int cpu = cpu_of(rq);
- preempt_disable();
- rcu_read_lock();
+ guard(preempt)();
+ guard(rcu)();
+
raw_spin_rq_unlock_irq(rq);
for_each_domain(cpu, sd) {
if (need_resched())
@@ -6379,8 +6362,6 @@ static void sched_core_balance(struct rq *rq)
break;
}
raw_spin_rq_lock_irq(rq);
- rcu_read_unlock();
- preempt_enable();
}
static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@ -6730,12 +6711,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
* If a worker goes to sleep, notify and ask workqueue whether it
* wants to wake up a task to maintain concurrency.
*/
- if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- if (task_flags & PF_WQ_WORKER)
- wq_worker_sleeping(tsk);
- else
- io_wq_worker_sleeping(tsk);
- }
+ if (task_flags & PF_WQ_WORKER)
+ wq_worker_sleeping(tsk);
+ else if (task_flags & PF_IO_WORKER)
+ io_wq_worker_sleeping(tsk);
/*
* spinlock and rwlock must not flush block requests. This will
@@ -7187,9 +7166,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
void set_user_nice(struct task_struct *p, long nice)
{
bool queued, running;
- int old_prio;
- struct rq_flags rf;
struct rq *rq;
+ int old_prio;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
@@ -7197,7 +7175,9 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &rf);
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
update_rq_clock(rq);
/*
@@ -7208,8 +7188,9 @@ void set_user_nice(struct task_struct *p, long nice)
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
- goto out_unlock;
+ return;
}
+
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
@@ -7232,9 +7213,6 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
p->sched_class->prio_changed(rq, p, old_prio);
-
-out_unlock:
- task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
@@ -7507,6 +7485,21 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
/*
* sched_setparam() passes in -1 for its policy, to let the functions
* it calls know not to change it.
@@ -7544,14 +7537,11 @@ static void __setscheduler_params(struct task_struct *p,
static bool check_same_owner(struct task_struct *p)
{
const struct cred *cred = current_cred(), *pcred;
- bool match;
+ guard(rcu)();
- rcu_read_lock();
pcred = __task_cred(p);
- match = (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
- rcu_read_unlock();
- return match;
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
}
/*
@@ -7963,27 +7953,17 @@ static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- struct task_struct *p;
- int retval;
if (!param || pid < 0)
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
-
- if (likely(p)) {
- retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
- }
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- return retval;
+ return sched_setscheduler(p, policy, &lparam);
}
/*
@@ -8079,7 +8059,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
{
struct sched_attr attr;
- struct task_struct *p;
int retval;
if (!uattr || pid < 0 || flags)
@@ -8094,21 +8073,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
attr.sched_policy = SETPARAM_POLICY;
- rcu_read_lock();
- retval = -ESRCH;
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
- if (likely(p)) {
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
- retval = sched_setattr(p, &attr);
- put_task_struct(p);
- }
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
- return retval;
+ return sched_setattr(p, &attr);
}
/**
@@ -8126,16 +8098,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
+ guard(rcu)();
p = find_process_by_pid(pid);
- if (p) {
- retval = security_task_getscheduler(p);
- if (!retval)
- retval = p->policy
- | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
}
- rcu_read_unlock();
return retval;
}
@@ -8156,30 +8129,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (!param || pid < 0)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- rcu_read_unlock();
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
/*
* This one might sleep, we cannot do it with a spinlock held ...
*/
- retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
- return retval;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
/*
@@ -8239,46 +8205,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- retval = -ESRCH;
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
-
- rcu_read_unlock();
+ }
return sched_attr_copy_to_user(uattr, &kattr, usize);
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
- int ret = 0;
-
/*
* If the task isn't a deadline task or admission control is
* disabled then we don't care about affinity changes.
@@ -8292,11 +8250,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
* tasks allowed to run on all the CPUs in the task's
* root_domain.
*/
- rcu_read_lock();
+ guard(rcu)();
if (!cpumask_subset(task_rq(p)->rd->span, mask))
- ret = -EBUSY;
- rcu_read_unlock();
- return ret;
+ return -EBUSY;
+
+ return 0;
}
#endif
@@ -8366,39 +8324,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
struct affinity_context ac;
struct cpumask *user_mask;
- struct task_struct *p;
int retval;
- rcu_read_lock();
-
- p = find_process_by_pid(pid);
- if (!p) {
- rcu_read_unlock();
+ CLASS(find_get_task, p)(pid);
+ if (!p)
return -ESRCH;
- }
- /* Prevent p going away */
- get_task_struct(p);
- rcu_read_unlock();
-
- if (p->flags & PF_NO_SETAFFINITY) {
- retval = -EINVAL;
- goto out_put_task;
- }
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
if (!check_same_owner(p)) {
- rcu_read_lock();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- rcu_read_unlock();
- retval = -EPERM;
- goto out_put_task;
- }
- rcu_read_unlock();
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_put_task;
+ return retval;
/*
* With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@ -8408,8 +8351,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
if (user_mask) {
cpumask_copy(user_mask, in_mask);
} else if (IS_ENABLED(CONFIG_SMP)) {
- retval = -ENOMEM;
- goto out_put_task;
+ return -ENOMEM;
}
ac = (struct affinity_context){
@@ -8421,8 +8363,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = __sched_setaffinity(p, &ac);
kfree(ac.user_mask);
-out_put_task:
- put_task_struct(p);
return retval;
}
@@ -8464,28 +8404,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
- unsigned long flags;
int retval;
- rcu_read_lock();
-
- retval = -ESRCH;
+ guard(rcu)();
p = find_process_by_pid(pid);
if (!p)
- goto out_unlock;
+ return -ESRCH;
retval = security_task_getscheduler(p);
if (retval)
- goto out_unlock;
+ return retval;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-out_unlock:
- rcu_read_unlock();
-
- return retval;
+ return 0;
}
/**
@@ -8932,55 +8865,46 @@ int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
- unsigned long flags;
int yielded = 0;
- local_irq_save(flags);
- rq = this_rq();
+ scoped_guard (irqsave) {
+ rq = this_rq();
again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- yielded = -ESRCH;
- goto out_irq;
- }
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
- double_rq_lock(rq, p_rq);
- if (task_rq(p) != p_rq) {
- double_rq_unlock(rq, p_rq);
- goto again;
- }
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
- if (!curr->sched_class->yield_to_task)
- goto out_unlock;
+ if (!curr->sched_class->yield_to_task)
+ return 0;
- if (curr->sched_class != p->sched_class)
- goto out_unlock;
+ if (curr->sched_class != p->sched_class)
+ return 0;
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- goto out_unlock;
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity takes care of
- * fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
}
-out_unlock:
- double_rq_unlock(rq, p_rq);
-out_irq:
- local_irq_restore(flags);
-
- if (yielded > 0)
+ if (yielded)
schedule();
return yielded;
@@ -9083,38 +9007,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
- struct task_struct *p;
- unsigned int time_slice;
- struct rq_flags rf;
- struct rq *rq;
+ unsigned int time_slice = 0;
int retval;
if (pid < 0)
return -EINVAL;
- retval = -ESRCH;
- rcu_read_lock();
- p = find_process_by_pid(pid);
- if (!p)
- goto out_unlock;
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
- retval = security_task_getscheduler(p);
- if (retval)
- goto out_unlock;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
- rq = task_rq_lock(p, &rf);
- time_slice = 0;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &rf);
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
- rcu_read_unlock();
jiffies_to_timespec64(time_slice, t);
return 0;
-
-out_unlock:
- rcu_read_unlock();
- return retval;
}
/**
@@ -10498,17 +10414,18 @@ void sched_move_task(struct task_struct *tsk)
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct task_group *group;
- struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &rf);
+ CLASS(task_rq_lock, rq_guard)(tsk);
+ rq = rq_guard.rq;
+
/*
* Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
* group changes.
*/
group = sched_get_task_group(tsk);
if (group == tsk->sched_task_group)
- goto unlock;
+ return;
update_rq_clock(rq);
@@ -10533,9 +10450,6 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
-
-unlock:
- task_rq_unlock(rq, tsk, &rf);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10572,11 +10486,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
#ifdef CONFIG_UCLAMP_TASK_GROUP
/* Propagate the effective uclamp value for the new group */
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
cpu_util_update_eff(css);
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
#endif
return 0;
@@ -10727,8 +10639,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
static_branch_enable(&sched_uclamp_used);
- mutex_lock(&uclamp_mutex);
- rcu_read_lock();
+ guard(mutex)(&uclamp_mutex);
+ guard(rcu)();
tg = css_tg(of_css(of));
if (tg->uclamp_req[clamp_id].value != req.util)
@@ -10743,9 +10655,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
/* Update effective clamps to track the most restrictive value */
cpu_util_update_eff(of_css(of));
- rcu_read_unlock();
- mutex_unlock(&uclamp_mutex);
-
return nbytes;
}
@@ -10771,10 +10680,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf,
u64 percent;
u32 rem;
- rcu_read_lock();
- tg = css_tg(seq_css(sf));
- util_clamp = tg->uclamp_req[clamp_id].value;
- rcu_read_unlock();
+ scoped_guard (rcu) {
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ }
if (util_clamp == SCHED_CAPACITY_SCALE) {
seq_puts(sf, "max\n");
@@ -10865,11 +10774,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- cpus_read_lock();
- mutex_lock(&cfs_constraints_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&cfs_constraints_mutex);
+
ret = __cfs_schedulable(tg, period, quota);
if (ret)
- goto out_unlock;
+ return ret;
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -10879,39 +10789,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
*/
if (runtime_enabled && !runtime_was_enabled)
cfs_bandwidth_usage_inc();
- raw_spin_lock_irq(&cfs_b->lock);
- cfs_b->period = ns_to_ktime(period);
- cfs_b->quota = quota;
- cfs_b->burst = burst;
- __refill_cfs_bandwidth_runtime(cfs_b);
+ scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ cfs_b->burst = burst;
- /* Restart the period timer (if active) to handle new period expiry: */
- if (runtime_enabled)
- start_cfs_bandwidth(cfs_b);
+ __refill_cfs_bandwidth_runtime(cfs_b);
- raw_spin_unlock_irq(&cfs_b->lock);
+ /*
+ * Restart the period timer (if active) to handle new
+ * period expiry:
+ */
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
+ }
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
- struct rq_flags rf;
- rq_lock_irq(rq, &rf);
+ guard(rq_lock_irq)(rq);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- rq_unlock_irq(rq, &rf);
}
+
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
-out_unlock:
- mutex_unlock(&cfs_constraints_mutex);
- cpus_read_unlock();
- return ret;
+ return 0;
}
static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -11096,7 +11005,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
- int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -11108,11 +11016,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- rcu_read_lock();
- ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
}
static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@ -11717,14 +11622,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
* are not the last task to be migrated from this cpu for this mm, so
* there is no need to move src_cid to the destination cpu.
*/
- rcu_read_lock();
+ guard(rcu)();
src_task = rcu_dereference(src_rq->curr);
if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
t->last_mm_cid = -1;
return -1;
}
- rcu_read_unlock();
return src_cid;
}
@@ -11768,18 +11671,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
* the lazy-put flag, this task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- rcu_read_unlock();
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
+ scoped_guard (rcu) {
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
}
- rcu_read_unlock();
/*
* The src_cid is unused, so it can be unset.
@@ -11852,7 +11754,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
- unsigned long flags;
int cid, lazy_cid;
cid = READ_ONCE(pcpu_cid->cid);
@@ -11887,23 +11788,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
* the lazy-put flag, that task will be responsible for transitioning
* from lazy-put flag set to MM_CID_UNSET.
*/
- rcu_read_lock();
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ return;
}
- rcu_read_unlock();
/*
* The cid is unused, so it can be unset.
* Disable interrupts to keep the window of cid ownership without rq
* lock small.
*/
- local_irq_save(flags);
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
- local_irq_restore(flags);
+ scoped_guard (irqsave) {
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
+ }
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@ -11925,14 +11824,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
* snapshot associated with this cid if an active task using the mm is
* observed on this rq.
*/
- rcu_read_lock();
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- rcu_read_unlock();
- return;
+ scoped_guard (rcu) {
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ return;
+ }
}
- rcu_read_unlock();
if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
return;
@@ -12026,7 +11924,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12034,7 +11931,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -12044,13 +11941,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12058,7 +11953,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
+ guard(rq_lock_irqsave)(rq);
preempt_enable_no_resched(); /* holding spinlock */
WRITE_ONCE(t->mm_cid_active, 0);
/*
@@ -12068,13 +11963,11 @@ void sched_mm_cid_before_execve(struct task_struct *t)
smp_mb();
mm_cid_put(mm);
t->last_mm_cid = t->mm_cid = -1;
- rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq_flags rf;
struct rq *rq;
if (!mm)
@@ -12082,16 +11975,16 @@ void sched_mm_cid_after_execve(struct task_struct *t)
preempt_disable();
rq = this_rq();
- rq_lock_irqsave(rq, &rf);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
- rq_unlock_irqrestore(rq, &rf);
+ scoped_guard (rq_lock_irqsave, rq) {
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ }
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 58b542bf2893..d98408a274e5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq)
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
- dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
#else
@@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
}
-static void update_dl_migration(struct dl_rq *dl_rq)
-{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
- if (!dl_rq->overloaded) {
- dl_set_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 1;
- }
- } else if (dl_rq->overloaded) {
- dl_clear_overload(rq_of_dl_rq(dl_rq));
- dl_rq->overloaded = 0;
- }
-}
-
-static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory++;
-
- update_dl_migration(dl_rq);
-}
-
-static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
- struct task_struct *p = dl_task_of(dl_se);
-
- if (p->nr_cpus_allowed > 1)
- dl_rq->dl_nr_migratory--;
-
- update_dl_migration(dl_rq);
-}
-
#define __node_2_pdl(node) \
rb_entry((node), struct task_struct, pushable_dl_tasks)
@@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b)
return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl);
}
+static inline int has_pushable_dl_tasks(struct rq *rq)
+{
+ return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+}
+
/*
* The list of pushable -deadline task is not a plist, like in
* sched_rt.c, it is an rb-tree with tasks ordered by deadline.
@@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
__pushable_less);
if (leftmost)
rq->dl.earliest_dl.next = p->dl.deadline;
+
+ if (!rq->dl.overloaded) {
+ dl_set_overload(rq);
+ rq->dl.overloaded = 1;
+ }
}
static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
@@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline;
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-}
-static inline int has_pushable_dl_tasks(struct rq *rq)
-{
- return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root);
+ if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
+ dl_clear_overload(rq);
+ rq->dl.overloaded = 0;
+ }
}
static int push_dl_task(struct rq *rq);
@@ -763,7 +739,7 @@ static inline void deadline_queue_pull_task(struct rq *rq)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
struct rq *rq)
@@ -1175,7 +1151,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
@@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}
static inline
@@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
}
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
@@ -1939,7 +1913,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
@@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq)
struct rq *later_rq;
int ret = 0;
- if (!rq->dl.overloaded)
- return 0;
-
next_task = pick_next_pushable_dl_task(rq);
if (!next_task)
return 0;
@@ -2652,7 +2623,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
deadline_queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
} else {
@@ -2721,7 +2692,7 @@ DEFINE_SCHED_CLASS(dl) = {
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
- .check_preempt_curr = check_preempt_curr_dl,
+ .wakeup_preempt = wakeup_preempt_dl,
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db6..4580a450700e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,7 +8,7 @@
*/
/*
- * This allows printing both to /proc/sched_debug and
+ * This allows printing both to /sys/kernel/debug/sched/debug and
* to the console
*/
#define SEQ_printf(m, x...) \
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
@@ -748,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
PU(dl_nr_running);
#ifdef CONFIG_SMP
- PU(dl_nr_migratory);
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
@@ -864,7 +860,6 @@ static void sched_debug_header(struct seq_file *m)
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
PN(sysctl_sched_base_slice);
- P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef7490c4b8b4..52c498fd6c46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,12 +78,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
unsigned int sysctl_sched_base_slice = 750000ULL;
static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
int sched_thermal_decay_shift;
@@ -145,13 +139,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#ifdef CONFIG_SYSCTL
static struct ctl_table sched_fair_sysctls[] = {
- {
- .procname = "sched_child_runs_first",
- .data = &sysctl_sched_child_runs_first,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -2855,19 +2842,7 @@ static void task_numa_placement(struct task_struct *p)
}
/* Cannot migrate task to CPU-less node */
- if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
- int near_nid = max_nid;
- int distance, near_distance = INT_MAX;
-
- for_each_node_state(nid, N_CPU) {
- distance = node_distance(max_nid, nid);
- if (distance < near_distance) {
- near_nid = nid;
- near_distance = distance;
- }
- }
- max_nid = near_nid;
- }
+ max_nid = numa_nearest_node(max_nid, N_CPU);
if (ng) {
numa_group_count_active_nodes(ng);
@@ -3896,7 +3871,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
- long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ long delta;
+ u64 now;
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3904,9 +3880,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;
+ /*
+ * For migration heavy workloads, access to tg->load_avg can be
+ * unbound. Limit the update rate to at most once per ms.
+ */
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+ return;
+
+ delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+ cfs_rq->last_update_tg_load_avg = now;
}
}
@@ -4580,22 +4566,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
return max(task_util(p), _task_util_est(p));
}
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
- unsigned long uclamp_min,
- unsigned long uclamp_max)
-{
- return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
- unsigned long uclamp_min,
- unsigned long uclamp_max)
-{
- return task_util_est(p);
-}
-#endif
-
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
struct task_struct *p)
{
@@ -4886,7 +4856,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
- return true;
+ return !cfs_rq->nr_running;
}
#define UPDATE_TG 0x0
@@ -5765,13 +5735,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
{
- struct cfs_rq *local_unthrottle = NULL;
int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
bool throttled = false;
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *tmp;
struct rq_flags rf;
struct rq *rq;
+ LIST_HEAD(local_unthrottle);
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5787,11 +5757,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
if (!cfs_rq_throttled(cfs_rq))
goto next;
-#ifdef CONFIG_SMP
/* Already queued for async unthrottle */
if (!list_empty(&cfs_rq->throttled_csd_list))
goto next;
-#endif
/* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5808,11 +5776,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0) {
- if (cpu_of(rq) != this_cpu ||
- SCHED_WARN_ON(local_unthrottle))
+ if (cpu_of(rq) != this_cpu) {
unthrottle_cfs_rq_async(cfs_rq);
- else
- local_unthrottle = cfs_rq;
+ } else {
+ /*
+ * We currently only expect to be unthrottling
+ * a single cfs_rq locally.
+ */
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ list_add_tail(&cfs_rq->throttled_csd_list,
+ &local_unthrottle);
+ }
} else {
throttled = true;
}
@@ -5820,15 +5794,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
next:
rq_unlock_irqrestore(rq, &rf);
}
- rcu_read_unlock();
- if (local_unthrottle) {
- rq = cpu_rq(this_cpu);
+ list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+ throttled_csd_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
rq_lock_irqsave(rq, &rf);
- if (cfs_rq_throttled(local_unthrottle))
- unthrottle_cfs_rq(local_unthrottle);
+
+ list_del_init(&cfs_rq->throttled_csd_list);
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+
rq_unlock_irqrestore(rq, &rf);
}
+ SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+ rcu_read_unlock();
return throttled;
}
@@ -6158,9 +6140,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7631,11 +7611,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
{
unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
unsigned long busy_time = eenv->pd_busy_time;
+ unsigned long energy;
if (dst_cpu >= 0)
busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
- return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+ energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+ trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+ return energy;
}
/*
@@ -7710,7 +7695,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!uclamp_task_util(p, p_util_min, p_util_max))
+ if (!task_util_est(p) && p_util_min == 0)
goto unlock;
eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7718,11 +7703,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
unsigned long cpu_cap, cpu_thermal_cap, util;
- unsigned long cur_delta, max_spare_cap = 0;
+ long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
- unsigned long prev_spare_cap = 0;
+ unsigned long cur_delta, base_energy;
int max_spare_cap_cpu = -1;
- unsigned long base_energy;
int fits, max_fits = -1;
cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7785,7 +7769,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
prev_spare_cap = cpu_cap;
prev_fits = fits;
} else if ((fits > max_fits) ||
- ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+ ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
* among the remaining CPUs in the performance
@@ -7797,7 +7781,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
}
- if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+ if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
continue;
eenv_pd_busy_time(&eenv, cpus, p);
@@ -7805,7 +7789,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
- if (prev_spare_cap > 0) {
+ if (prev_spare_cap > -1) {
prev_delta = compute_energy(&eenv, pd, cpus, p,
prev_cpu);
/* CPU utilization has changed */
@@ -8006,7 +7990,7 @@ static void set_next_buddy(struct sched_entity *se)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -8019,7 +8003,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_preempt_curr() after an enqueue (which may have
+ * unconditionally wakeup_preempt() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
*/
@@ -8926,7 +8910,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
WARN_ON_ONCE(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
/*
@@ -12401,7 +12385,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (p->prio > oldprio)
resched_curr(rq);
} else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12503,7 +12487,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
if (task_current(rq, p))
resched_curr(rq);
else
- check_preempt_curr(rq, p, 0);
+ wakeup_preempt(rq, p, 0);
}
}
@@ -12862,7 +12846,7 @@ DEFINE_SCHED_CLASS(fair) = {
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .check_preempt_curr = check_preempt_wakeup,
+ .wakeup_preempt = check_preempt_wakeup_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 5007b25c5bc6..565f8374ddbb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
{
resched_curr(rq);
}
@@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
- .check_preempt_curr = check_preempt_curr_idle,
+ .wakeup_preempt = wakeup_preempt_idle,
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0597ba0f85ff..88fc98601413 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -37,6 +37,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rt_runtime_us",
@@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
+ .extra1 = SYSCTL_NEG_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_rr_timeslice_ms",
@@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}
@@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
/*
* When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
+ * throttled wakeup_preempt() will set
* skip_update and the time between the wakeup
* and this unthrottle will get accounted as
* 'runtime'.
@@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
@@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
@@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
resched_curr(rq);
@@ -2702,7 +2656,7 @@ DEFINE_SCHED_CLASS(rt) = {
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
- .check_preempt_curr = check_preempt_curr_rt,
+ .wakeup_preempt = wakeup_preempt_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
@@ -2985,9 +2939,6 @@ static int sched_rt_global_constraints(void)
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
((u64)sysctl_sched_rt_runtime *
@@ -3018,7 +2969,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
ret = sched_rt_global_validate();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..649eb9ec0657 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,15 +74,6 @@
#include "../workqueue_internal.h"
-#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-#endif
-
-#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
-#endif
-
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
# include <asm/paravirt_api_clock.h>
@@ -109,8 +100,6 @@ extern __read_mostly int scheduler_running;
extern unsigned long calc_load_update;
extern atomic_long_t calc_load_tasks;
-extern unsigned int sysctl_sched_child_runs_first;
-
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
@@ -594,6 +583,7 @@ struct cfs_rq {
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
+ u64 last_update_tg_load_avg;
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
@@ -644,9 +634,7 @@ struct cfs_rq {
int throttled;
int throttle_count;
struct list_head throttled_list;
-#ifdef CONFIG_SMP
struct list_head throttled_csd_list;
-#endif
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -675,8 +663,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned int rt_nr_migratory;
- unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
@@ -721,7 +707,6 @@ struct dl_rq {
u64 next;
} earliest_dl;
- unsigned int dl_nr_migratory;
int overloaded;
/*
@@ -1658,6 +1643,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
+ _T->rq = task_rq_lock(_T->lock, &_T->rf),
+ task_rq_unlock(_T->rq, _T->lock, &_T->rf),
+ struct rq *rq; struct rq_flags rf)
+
static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
@@ -2239,7 +2229,7 @@ struct sched_class {
void (*yield_task) (struct rq *rq);
bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
- void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
+ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
struct task_struct *(*pick_next_task)(struct rq *rq);
@@ -2513,7 +2503,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
#ifdef CONFIG_PREEMPT_RT
#define SCHED_NR_MIGRATE_BREAK 8
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 85590599b4d6..6cf7304e6449 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SMP */
static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
{
/* we're never preempted */
}
@@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = {
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,
- .check_preempt_curr = check_preempt_curr_stop,
+ .wakeup_preempt = wakeup_preempt_stop,
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05a5bc678c08..a7b50bba7829 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2112,22 +2112,31 @@ static int hop_cmp(const void *a, const void *b)
return -1;
}
-/*
- * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
- * closest to @cpu from @cpumask.
- * cpumask: cpumask to find a cpu from
- * cpu: Nth cpu to find
- *
- * returns: cpu, or nr_cpu_ids when nothing found.
+/**
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU
+ * from @cpus to @cpu, taking into account distance
+ * from a given @node.
+ * @cpus: cpumask to find a cpu from
+ * @cpu: CPU to start searching
+ * @node: NUMA node to order CPUs by distance
+ *
+ * Return: cpu, or nr_cpu_ids when nothing found.
*/
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
- struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+ struct __cmp_key k = { .cpus = cpus, .cpu = cpu };
struct cpumask ***hop_masks;
int hop, ret = nr_cpu_ids;
+ if (node == NUMA_NO_NODE)
+ return cpumask_nth_and(cpu, cpus, cpu_online_mask);
+
rcu_read_lock();
+ /* CPU-less node entries are uninitialized in sched_domains_numa_masks */
+ node = numa_nearest_node(node, N_CPU);
+ k.node = node;
+
k.masks = rcu_dereference(sched_domains_numa_masks);
if (!k.masks)
goto unlock;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index a7fd02b5ae26..34335c1e7265 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -146,9 +146,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
- cpu = (node == NUMA_NO_NODE) ?
- cpumask_nth(i, cpu_online_mask) :
- sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+ cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node);
WARN_ON(cpu >= nr_cpu_ids);
return cpu;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1b00d6ac7ee..5051ae3eb3fc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,22 +131,26 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
- * numa_map_to_online_node - Find closest online node
+ * numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
+ * @state: State to filter the search
*
- * Lookup the next closest node by distance if @nid is not online.
+ * Lookup the closest node by distance if @nid is not in state.
*
- * Return: this @node if it is online, otherwise the closest node by distance
+ * Return: this @node if it is in state, otherwise the closest node by distance
*/
-int numa_map_to_online_node(int node)
+int numa_nearest_node(int node, unsigned int state)
{
int min_dist = INT_MAX, dist, n, min_node;
- if (node == NUMA_NO_NODE || node_online(node))
+ if (state >= NR_NODE_STATES)
+ return -EINVAL;
+
+ if (node == NUMA_NO_NODE || node_state(node, state))
return node;
min_node = node;
- for_each_online_node(n) {
+ for_each_node_state(n, state) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
@@ -156,7 +160,7 @@ int numa_map_to_online_node(int node)
return min_node;
}
-EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+EXPORT_SYMBOL_GPL(numa_nearest_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{