kernel: Backport MGLRU patch from 6.4

This fixes a bug where the reclaim path could occasionally have long tail latency.

Signed-off-by: Kazuki Hashimoto <kazukih0205@gmail.com>
author: Kazuki Hashimoto <kazukih0205@gmail.com> 2023-05-24 23:58:03 +0900
committer: Christian Marangi <ansuelsmth@gmail.com> 2023-05-25 23:52:24 +0200
commit: ae8db3941ce90f4b00dfeb36a428425cdc53144d (patch)
tree: d7b585cbd97cac64b36f63812ef65737ce063229 /target/linux/generic/backport-6.1
parent: b28b8ed1f45de1ecff0a893345a0fcbf3fff3bbe (diff)
download: openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.tar.gz
openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.tar.bz2
openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.zip
1 files changed, 280 insertions, 0 deletions
diff --git a/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch b/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch
new file mode 100644
index 0000000000..4bf796c93d
--- /dev/null
+++ b/target/linux/generic/backport-6.1/020-v6.4-19-mm-Multi-gen-LRU-remove-wait_event_killable.patch
@@ -0,0 +1,280 @@
+From 418038c22452df38cde519cc8c662bb15139764a Mon Sep 17 00:00:00 2001
+From: Kalesh Singh <kaleshsingh@google.com>
+Date: Thu, 13 Apr 2023 14:43:26 -0700
+Subject: [PATCH 19/19] mm: Multi-gen LRU: remove wait_event_killable()
+
+Android 14 and later default to MGLRU [1] and field telemetry showed
+occasional long tail latency (>100ms) in the reclaim path.
+
+Tracing revealed priority inversion in the reclaim path.  In
+try_to_inc_max_seq(), when high priority tasks were blocked on
+wait_event_killable(), the preemption of the low priority task to call
+wake_up_all() caused those high priority tasks to wait longer than
+necessary.  In general, this problem is not different from others of its
+kind, e.g., one caused by mutex_lock().  However, it is specific to MGLRU
+because it introduced the new wait queue lruvec->mm_state.wait.
+
+The purpose of this new wait queue is to avoid the thundering herd
+problem.  If many direct reclaimers rush into try_to_inc_max_seq(), only
+one can succeed, i.e., the one to wake up the rest, and the rest who
+failed might cause premature OOM kills if they do not wait.  So far there
+is no evidence supporting this scenario, based on how often the wait has
+been hit.  And this begs the question how useful the wait queue is in
+practice.
+
+Based on Minchan's recommendation, which is in line with his commit
+6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path") and the
+rest of the MGLRU code which also uses trylock when possible, remove the
+wait queue.
+
+[1] https://android-review.googlesource.com/q/I7ed7fbfd6ef9ce10053347528125dd98c39e50bf
+
+Link: https://lkml.kernel.org/r/20230413214326.2147568-1-kaleshsingh@google.com
+Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
+Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
+Suggested-by: Minchan Kim <minchan@kernel.org>
+Reported-by: Wei Wang <wvw@google.com>
+Acked-by: Yu Zhao <yuzhao@google.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
+Cc: Oleksandr Natalenko <oleksandr@natalenko.name>
+Cc: Suleiman Souhlal <suleiman@google.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+ include/linux/mmzone.h |   8 +--
+ mm/vmscan.c            | 112 +++++++++++++++--------------------------
+ 2 files changed, 42 insertions(+), 78 deletions(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 403c7461e7a70..d62a5accf1be4 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -453,18 +453,14 @@ enum {
+ struct lru_gen_mm_state {
+ 	/* set to max_seq after each iteration */
+ 	unsigned long seq;
+-	/* where the current iteration continues (inclusive) */
++	/* where the current iteration continues after */
+ 	struct list_head *head;
+-	/* where the last iteration ended (exclusive) */
++	/* where the last iteration ended before */
+ 	struct list_head *tail;
+-	/* to wait for the last page table walker to finish */
+-	struct wait_queue_head wait;
+ 	/* Bloom filters flip after each iteration */
+ 	unsigned long *filters[NR_BLOOM_FILTERS];
+ 	/* the mm stats for debugging */
+ 	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+-	/* the number of concurrent page table walkers */
+-	int nr_walkers;
+ };
+ 
+ struct lru_gen_mm_walk {
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index f6ce7a1fd78a3..851758303dbf4 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3371,18 +3371,13 @@ void lru_gen_del_mm(struct mm_struct *mm)
+ 		if (!lruvec)
+ 			continue;
+ 
+-		/* where the last iteration ended (exclusive) */
++		/* where the current iteration continues after */
++		if (lruvec->mm_state.head == &mm->lru_gen.list)
++			lruvec->mm_state.head = lruvec->mm_state.head->prev;
++
++		/* where the last iteration ended before */
+ 		if (lruvec->mm_state.tail == &mm->lru_gen.list)
+ 			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+-
+-		/* where the current iteration continues (inclusive) */
+-		if (lruvec->mm_state.head != &mm->lru_gen.list)
+-			continue;
+-
+-		lruvec->mm_state.head = lruvec->mm_state.head->next;
+-		/* the deletion ends the current iteration */
+-		if (lruvec->mm_state.head == &mm_list->fifo)
+-			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
+ 	}
+ 
+ 	list_del_init(&mm->lru_gen.list);
+@@ -3478,68 +3473,54 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+ 			    struct mm_struct **iter)
+ {
+ 	bool first = false;
+-	bool last = true;
++	bool last = false;
+ 	struct mm_struct *mm = NULL;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ 	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+ 
+ 	/*
+-	 * There are four interesting cases for this page table walker:
+-	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
+-	 *    there is nothing left to do.
+-	 * 2. It's the first of the current generation, and it needs to reset
+-	 *    the Bloom filter for the next generation.
+-	 * 3. It reaches the end of mm_list, and it needs to increment
+-	 *    mm_state->seq; the iteration is done.
+-	 * 4. It's the last of the current generation, and it needs to reset the
+-	 *    mm stats counters for the next generation.
++	 * mm_state->seq is incremented after each iteration of mm_list. There
++	 * are three interesting cases for this page table walker:
++	 * 1. It tries to start a new iteration with a stale max_seq: there is
++	 *    nothing left to do.
++	 * 2. It started the next iteration: it needs to reset the Bloom filter
++	 *    so that a fresh set of PTE tables can be recorded.
++	 * 3. It ended the current iteration: it needs to reset the mm stats
++	 *    counters and tell its caller to increment max_seq.
+ 	 */
+ 	spin_lock(&mm_list->lock);
+ 
+ 	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+-	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
+-	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
+ 
+-	if (walk->max_seq <= mm_state->seq) {
+-		if (!*iter)
+-			last = false;
++	if (walk->max_seq <= mm_state->seq)
+ 		goto done;
+-	}
+ 
+-	if (!mm_state->nr_walkers) {
+-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
++	if (!mm_state->head)
++		mm_state->head = &mm_list->fifo;
+ 
+-		mm_state->head = mm_list->fifo.next;
++	if (mm_state->head == &mm_list->fifo)
+ 		first = true;
+-	}
+-
+-	while (!mm && mm_state->head != &mm_list->fifo) {
+-		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ 
++	do {
+ 		mm_state->head = mm_state->head->next;
++		if (mm_state->head == &mm_list->fifo) {
++			WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
++			last = true;
++			break;
++		}
+ 
+ 		/* force scan for those added after the last iteration */
+-		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
+-			mm_state->tail = mm_state->head;
++		if (!mm_state->tail || mm_state->tail == mm_state->head) {
++			mm_state->tail = mm_state->head->next;
+ 			walk->force_scan = true;
+ 		}
+ 
++		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ 		if (should_skip_mm(mm, walk))
+ 			mm = NULL;
+-	}
+-
+-	if (mm_state->head == &mm_list->fifo)
+-		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
++	} while (!mm);
+ done:
+-	if (*iter && !mm)
+-		mm_state->nr_walkers--;
+-	if (!*iter && mm)
+-		mm_state->nr_walkers++;
+-
+-	if (mm_state->nr_walkers)
+-		last = false;
+-
+ 	if (*iter || last)
+ 		reset_mm_stats(lruvec, walk, last);
+ 
+@@ -3567,9 +3548,9 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+ 
+ 	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+ 
+-	if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
+-		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+-
++	if (max_seq > mm_state->seq) {
++		mm_state->head = NULL;
++		mm_state->tail = NULL;
+ 		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ 		reset_mm_stats(lruvec, NULL, true);
+ 		success = true;
+@@ -4172,10 +4153,6 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+ 
+ 		walk_pmd_range(&val, addr, next, args);
+ 
+-		/* a racy check to curtail the waiting time */
+-		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
+-			return 1;
+-
+ 		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
+ 			end = (addr | ~PUD_MASK) + 1;
+ 			goto done;
+@@ -4208,8 +4185,14 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
+ 	walk->next_addr = FIRST_USER_ADDRESS;
+ 
+ 	do {
++		DEFINE_MAX_SEQ(lruvec);
++
+ 		err = -EBUSY;
+ 
++		/* another thread might have called inc_max_seq() */
++		if (walk->max_seq != max_seq)
++			break;
++
+ 		/* folio_update_gen() requires stable folio_memcg() */
+ 		if (!mem_cgroup_trylock_pages(memcg))
+ 			break;
+@@ -4442,25 +4425,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ 		success = iterate_mm_list(lruvec, walk, &mm);
+ 		if (mm)
+ 			walk_mm(lruvec, mm, walk);
+-
+-		cond_resched();
+ 	} while (mm);
+ done:
+-	if (!success) {
+-		if (sc->priority <= DEF_PRIORITY - 2)
+-			wait_event_killable(lruvec->mm_state.wait,
+-					    max_seq < READ_ONCE(lrugen->max_seq));
+-		return false;
+-	}
++	if (success)
++		inc_max_seq(lruvec, can_swap, force_scan);
+ 
+-	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+-
+-	inc_max_seq(lruvec, can_swap, force_scan);
+-	/* either this sees any waiters or they will see updated max_seq */
+-	if (wq_has_sleeper(&lruvec->mm_state.wait))
+-		wake_up_all(&lruvec->mm_state.wait);
+-
+-	return true;
++	return success;
+ }
+ 
+ /******************************************************************************
+@@ -6105,7 +6075,6 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
+ 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
+ 
+ 	lruvec->mm_state.seq = MIN_NR_GENS;
+-	init_waitqueue_head(&lruvec->mm_state.wait);
+ }
+ 
+ #ifdef CONFIG_MEMCG
+@@ -6138,7 +6107,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+ 	for_each_node(nid) {
+ 		struct lruvec *lruvec = get_lruvec(memcg, nid);
+ 
+-		VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers);
+ 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ 					   sizeof(lruvec->lrugen.nr_pages)));
+ 
+-- 
+2.40.1
+
author	Kazuki Hashimoto <kazukih0205@gmail.com>	2023-05-24 23:58:03 +0900
committer	Christian Marangi <ansuelsmth@gmail.com>	2023-05-25 23:52:24 +0200
commit	ae8db3941ce90f4b00dfeb36a428425cdc53144d (patch)
tree	d7b585cbd97cac64b36f63812ef65737ce063229 /target/linux/generic/backport-6.1
parent	b28b8ed1f45de1ecff0a893345a0fcbf3fff3bbe (diff)
download	openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.tar.gz openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.tar.bz2 openwrt-ae8db3941ce90f4b00dfeb36a428425cdc53144d.zip