path: root/kernel/sched
author     Linus Torvalds <torvalds@linux-foundation.org>  2023-06-27 16:54:21 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-06-27 16:54:21 -0700
commit     6e2332e0ab532eb01f1fde39080dbfa9bf65c4cf (patch)
tree       ebdbcd8403d9818b998b90c586c33879546fc6a8 /kernel/sched
parent     72dc6db7e3b692f46f3386b8dd5101d3f431adef (diff)
parent     81621430c81bb7965c3d5807039bc2b5b3ec87ca (diff)
Merge tag 'cgroup-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Whenever cpuset needs to rebuild sched_domain, it walked all tasks
   looking for DEADLINE tasks as they need to be accounted on the new
   domain. Walking all tasks can be expensive and there may not be any
   DEADLINE tasks at all. Task iteration is now omitted if there are no
   DEADLINE tasks

 - Fixes DEADLINE bandwidth misaccounting after task migration failures

 - When no controller is enabled, -Wstringop-overflow warning is
   triggered. The fix patch added an early exit which is too eager and
   got reverted for now. Will fix later

 - Everything else is minor cleanups

* tag 'cgroup-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  Revert "cgroup: Avoid -Wstringop-overflow warnings"
  cgroup/misc: Expose misc.current on cgroup v2 root
  cgroup: Avoid -Wstringop-overflow warnings
  cgroup: remove obsolete comment on cgroup_on_dfl()
  cgroup: remove unused task_cgroup_path()
  cgroup/cpuset: remove unneeded header files
  cgroup: make cgroup_is_threaded() and cgroup_is_thread_root() static
  rdmacg: fix kernel-doc warnings in rdmacg
  cgroup: Replace the css_set call with cgroup_get
  cgroup: remove unused macro for_each_e_css()
  cgroup: Update out-of-date comment in cgroup_migrate()
  cgroup: Replace all non-returning strlcpy with strscpy
  cgroup/cpuset: remove unneeded header files
  cgroup/cpuset: Free DL BW in case can_attach() fails
  sched/deadline: Create DL BW alloc, free & check overflow interface
  cgroup/cpuset: Iterate only if DEADLINE tasks are present
  sched/cpuset: Keep track of SCHED_DEADLINE task in cpusets
  sched/cpuset: Bring back cpuset_mutex
  cgroup/cpuset: Rename functions dealing with DEADLINE accounting
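The series replaces dl_cpu_busy() with a small request-based interface, dl_bw_check_overflow(), dl_bw_alloc() and dl_bw_free(), all visible in the deadline.c hunk further down. As a rough illustration of how an attach-style caller is expected to use it (per "cgroup/cpuset: Free DL BW in case can_attach() fails"), here is a minimal sketch; the function name, the later_ok flag and the error handling are invented for illustration and are not code from this merge:

/*
 * Hypothetical caller of the new DL bandwidth interface (not from this
 * merge). dl_bw_alloc()/dl_bw_free() are the helpers added below;
 * cpuset_attach_sketch() and "later_ok" are made up for illustration.
 */
static int cpuset_attach_sketch(struct task_struct *p, int dest_cpu, bool later_ok)
{
	int ret;

	/* Admission control: -EBUSY if the destination root domain is full. */
	ret = dl_bw_alloc(dest_cpu, p->dl.dl_bw);
	if (ret)
		return ret;

	if (!later_ok) {
		/* A later attach step failed: give the reservation back. */
		dl_bw_free(dest_cpu, p->dl.dl_bw);
		return -EINVAL;
	}

	return 0;
}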
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c      41
-rw-r--r--  kernel/sched/deadline.c  67
-rw-r--r--  kernel/sched/sched.h      2
3 files changed, 73 insertions(+), 37 deletions(-)
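Related background for the "Iterate only if DEADLINE tasks are present" change: the deadline.c hunk below calls inc_dl_tasks_cs()/dec_dl_tasks_cs() whenever a task switches into or out of SCHED_DEADLINE. The cpuset side of that counter lives outside this kernel/sched diffstat, so the sketch below only illustrates the idea; the struct field and function bodies are assumptions, not code from this merge:

/*
 * Hedged sketch of the per-cpuset DEADLINE task counter implied by the
 * inc_dl_tasks_cs()/dec_dl_tasks_cs() calls in the deadline.c hunk.
 * Struct and function names here are invented; the real code is in
 * kernel/cgroup/cpuset.c.
 */
struct cpuset_sketch {
	int nr_deadline_tasks;	/* SCHED_DEADLINE tasks in this cpuset */
};

static void inc_dl_tasks_cs_sketch(struct cpuset_sketch *cs)
{
	cs->nr_deadline_tasks++;
}

static void dec_dl_tasks_cs_sketch(struct cpuset_sketch *cs)
{
	cs->nr_deadline_tasks--;
}

/* sched_domain rebuild path: skip the task walk when the count is zero. */
static void rebuild_dl_accounting_sketch(struct cpuset_sketch *cs)
{
	if (!cs->nr_deadline_tasks)
		return;	/* no DEADLINE tasks, nothing to re-account */

	/* ... otherwise walk the cpuset's tasks and account their DL bandwidth ... */
}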
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 23203daf7128..c52c2eba7c73 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7631,6 +7631,7 @@ static int __sched_setscheduler(struct task_struct *p,
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -7680,8 +7681,14 @@ recheck:
return retval;
}
- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -7757,8 +7764,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}
@@ -7825,7 +7832,8 @@ change:
task_rq_unlock(rq, p, &rf);
if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}
@@ -7837,8 +7845,8 @@ change:
unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}
@@ -9327,8 +9335,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_effective_cpus)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -9341,21 +9348,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
-
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
- ret = dl_cpu_busy(cpu, p);
- }
-
-out:
return ret;
}
@@ -9638,7 +9633,7 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_cpu_busy(cpu, NULL);
+ int ret = dl_bw_check_overflow(cpu);
if (ret)
return ret;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e41a36bd66a6..58b542bf2893 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
+#include <linux/cpuset.h>
+
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting ridiculously long effective runtimes, on the bottom end we
@@ -2585,6 +2587,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
+
if (!task_on_rq_queued(p)) {
/*
* Inactive timer is armed. However, p is leaving DEADLINE and
@@ -2625,6 +2633,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
add_rq_bw(&p->dl, &rq->dl);
@@ -3033,26 +3047,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int dl_cpu_busy(int cpu, struct task_struct *p)
+enum dl_bw_request {
+ dl_bw_req_check_overflow = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags, cap;
+ unsigned long flags;
struct dl_bw *dl_b;
- bool overflow;
+ bool overflow = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
- if (!overflow && p) {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ if (req == dl_bw_req_free) {
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ } else {
+ unsigned long cap = dl_bw_capacity(cpu);
+
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (req == dl_bw_req_alloc && !overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3060,6 +3086,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p)
return overflow ? -EBUSY : 0;
}
+
+int dl_bw_check_overflow(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+}
#endif
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 50d4b61aef3a..e93e006a942b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -324,7 +324,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_cpu_busy(int cpu, struct task_struct *p);
+extern int dl_bw_check_overflow(int cpu);
#ifdef CONFIG_CGROUP_SCHED