sched/fair: Don't free p->numa_faults with concurrent readers

commit 16d51a590a8ce3befb1308e0e7ab77f3b661af33 upstream. When going through execve(), zero out the NUMA fault statistics instead of freeing them. During execve, the task is reachable through procfs and the scheduler. A concurrent /proc/*/sched reader can read data from a freed ->numa_faults allocation (confirmed by KASAN) and write it back to userspace. I believe that it would also be possible for a use-after-free read to occur through a race between a NUMA fault and execve(): task_numa_fault() can lead to task_numa_compare(), which invokes task_weight() on the currently running task of a different CPU. Another way to fix this would be to make ->numa_faults RCU-managed or add extra locking, but it seems easier to wipe the NUMA fault statistics on execve. Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Petr Mladek <pmladek@suse.com> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Fixes: 82727018b0d3 ("sched/numa: Call task_numa_free() from do_execve()") Link: https://lkml.kernel.org/r/20190716152047.14424-1-jannh@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org> [bwh: Backported to 3.16: adjust filename, context] Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
author: Jann Horn <jannh@google.com> 2019-07-16 17:20:45 +0200
committer: Ben Hutchings <ben@decadent.org.uk> 2019-11-22 15:57:07 +0000
commit: e65d89d6e78cf1463e755a33e013bde15b894cf5 (patch)
tree: 2c1a842dde7f0a9c79a36e1e836d1b727646a0ec
parent: ed0f0cbd7d3c737328e40982cc8ec7f63477f828 (diff)
download: linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.tar.gz
linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.tar.bz2
linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.zip
4 files changed, 27 insertions, 11 deletions
diff --git a/fs/exec.c b/fs/exec.c
index 2acff9b648c0..077f85439264 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1589,7 +1589,7 @@ static int do_execve_common(struct filename *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
-	task_numa_free(current);
+	task_numa_free(current, false);
 	free_bprm(bprm);
 	putname(filename);
 	if (displaced)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 754a7cb0699e..bcd8b1664301 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1671,7 +1671,7 @@ struct task_struct {
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
-extern void task_numa_free(struct task_struct *p);
+extern void task_numa_free(struct task_struct *p, bool final);
 extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
 					int src_nid, int dst_cpu);
 #else
@@ -1686,7 +1686,7 @@ static inline pid_t task_numa_group_id(struct task_struct *p)
 static inline void set_numabalancing_state(bool enabled)
 {
 }
-static inline void task_numa_free(struct task_struct *p)
+static inline void task_numa_free(struct task_struct *p, bool final)
 {
 }
 static inline bool should_numa_migrate_memory(struct task_struct *p,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7dc86b50f925..2efc7a650c54 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -242,7 +242,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	task_numa_free(tsk);
+	task_numa_free(tsk, true);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f967ff776e5b..7b882eed3e47 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1747,13 +1747,23 @@ no_join:
 	return;
 }
 
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
 {
 	struct numa_group *grp = p->numa_group;
-	void *numa_faults = p->numa_faults_memory;
+	unsigned long *numa_faults = p->numa_faults_memory;
 	unsigned long flags;
 	int i;
 
+	if (!numa_faults)
+		return;
+
 	if (grp) {
 		spin_lock_irqsave(&grp->lock, flags);
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -1767,11 +1777,17 @@ void task_numa_free(struct task_struct *p)
 		put_numa_group(grp);
 	}
 
-	p->numa_faults_memory = NULL;
-	p->numa_faults_buffer_memory = NULL;
-	p->numa_faults_cpu= NULL;
-	p->numa_faults_buffer_cpu = NULL;
-	kfree(numa_faults);
+	if (final) {
+		p->numa_faults_memory = NULL;
+		p->numa_faults_buffer_memory = NULL;
+		p->numa_faults_cpu = NULL;
+		p->numa_faults_buffer_cpu = NULL;
+		kfree(numa_faults);
+	} else {
+		p->total_numa_faults = 0;
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			numa_faults[i] = 0;
+	}
 }
 
 /*
author	Jann Horn <jannh@google.com>	2019-07-16 17:20:45 +0200
committer	Ben Hutchings <ben@decadent.org.uk>	2019-11-22 15:57:07 +0000
commit	e65d89d6e78cf1463e755a33e013bde15b894cf5 (patch)
tree	2c1a842dde7f0a9c79a36e1e836d1b727646a0ec
parent	ed0f0cbd7d3c737328e40982cc8ec7f63477f828 (diff)
download	linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.tar.gz linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.tar.bz2 linux-stable-e65d89d6e78cf1463e755a33e013bde15b894cf5.zip