path: root/mm/vmscan.c
author     Ingo Molnar <mingo@kernel.org>  2024-02-14 10:45:07 +0100
committer  Ingo Molnar <mingo@kernel.org>  2024-02-14 10:45:07 +0100
commit     03c11eb3b16dc0058589751dfd91f254be2be613 (patch)
tree       e5f2889212fec0bb0babdce9abd781ab487e246a /mm/vmscan.c
parent     de8c6a352131f642b82474abe0cbb5dd26a7e081 (diff)
parent     841c35169323cd833294798e58b9bf63fa4fa1de (diff)
Merge tag 'v6.8-rc4' into x86/percpu, to resolve conflicts and refresh the branch
Conflicts:
	arch/x86/include/asm/percpu.h
	arch/x86/include/asm/text-patching.h

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 1115
1 file changed, 240 insertions(+), 875 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f13394b112e..4f9c854ce6cc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,7 +35,6 @@
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
-#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -188,246 +187,7 @@ struct scan_control {
*/
int vm_swappiness = 60;
-LIST_HEAD(shrinker_list);
-DECLARE_RWSEM(shrinker_rwsem);
-
#ifdef CONFIG_MEMCG
-static int shrinker_nr_max;
-
-/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
-static inline int shrinker_map_size(int nr_items)
-{
- return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
-}
-
-static inline int shrinker_defer_size(int nr_items)
-{
- return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
-}
-
-static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
- int nid)
-{
- return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
- lockdep_is_held(&shrinker_rwsem));
-}
-
-static int expand_one_shrinker_info(struct mem_cgroup *memcg,
- int map_size, int defer_size,
- int old_map_size, int old_defer_size,
- int new_nr_max)
-{
- struct shrinker_info *new, *old;
- struct mem_cgroup_per_node *pn;
- int nid;
- int size = map_size + defer_size;
-
- for_each_node(nid) {
- pn = memcg->nodeinfo[nid];
- old = shrinker_info_protected(memcg, nid);
- /* Not yet online memcg */
- if (!old)
- return 0;
-
- /* Already expanded this shrinker_info */
- if (new_nr_max <= old->map_nr_max)
- continue;
-
- new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
- if (!new)
- return -ENOMEM;
-
- new->nr_deferred = (atomic_long_t *)(new + 1);
- new->map = (void *)new->nr_deferred + defer_size;
- new->map_nr_max = new_nr_max;
-
- /* map: set all old bits, clear all new bits */
- memset(new->map, (int)0xff, old_map_size);
- memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
- /* nr_deferred: copy old values, clear all new values */
- memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
- memset((void *)new->nr_deferred + old_defer_size, 0,
- defer_size - old_defer_size);
-
- rcu_assign_pointer(pn->shrinker_info, new);
- kvfree_rcu(old, rcu);
- }
-
- return 0;
-}
-
-void free_shrinker_info(struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_node *pn;
- struct shrinker_info *info;
- int nid;
-
- for_each_node(nid) {
- pn = memcg->nodeinfo[nid];
- info = rcu_dereference_protected(pn->shrinker_info, true);
- kvfree(info);
- rcu_assign_pointer(pn->shrinker_info, NULL);
- }
-}
-
-int alloc_shrinker_info(struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
- int nid, size, ret = 0;
- int map_size, defer_size = 0;
-
- down_write(&shrinker_rwsem);
- map_size = shrinker_map_size(shrinker_nr_max);
- defer_size = shrinker_defer_size(shrinker_nr_max);
- size = map_size + defer_size;
- for_each_node(nid) {
- info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
- if (!info) {
- free_shrinker_info(memcg);
- ret = -ENOMEM;
- break;
- }
- info->nr_deferred = (atomic_long_t *)(info + 1);
- info->map = (void *)info->nr_deferred + defer_size;
- info->map_nr_max = shrinker_nr_max;
- rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
- }
- up_write(&shrinker_rwsem);
-
- return ret;
-}
-
-static int expand_shrinker_info(int new_id)
-{
- int ret = 0;
- int new_nr_max = round_up(new_id + 1, BITS_PER_LONG);
- int map_size, defer_size = 0;
- int old_map_size, old_defer_size = 0;
- struct mem_cgroup *memcg;
-
- if (!root_mem_cgroup)
- goto out;
-
- lockdep_assert_held(&shrinker_rwsem);
-
- map_size = shrinker_map_size(new_nr_max);
- defer_size = shrinker_defer_size(new_nr_max);
- old_map_size = shrinker_map_size(shrinker_nr_max);
- old_defer_size = shrinker_defer_size(shrinker_nr_max);
-
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- ret = expand_one_shrinker_info(memcg, map_size, defer_size,
- old_map_size, old_defer_size,
- new_nr_max);
- if (ret) {
- mem_cgroup_iter_break(NULL, memcg);
- goto out;
- }
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-out:
- if (!ret)
- shrinker_nr_max = new_nr_max;
-
- return ret;
-}
-
-void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
-{
- if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
- struct shrinker_info *info;
-
- rcu_read_lock();
- info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
- if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
- /* Pairs with smp mb in shrink_slab() */
- smp_mb__before_atomic();
- set_bit(shrinker_id, info->map);
- }
- rcu_read_unlock();
- }
-}
-
-static DEFINE_IDR(shrinker_idr);
-
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- int id, ret = -ENOMEM;
-
- if (mem_cgroup_disabled())
- return -ENOSYS;
-
- down_write(&shrinker_rwsem);
- /* This may call shrinker, so it must use down_read_trylock() */
- id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
- if (id < 0)
- goto unlock;
-
- if (id >= shrinker_nr_max) {
- if (expand_shrinker_info(id)) {
- idr_remove(&shrinker_idr, id);
- goto unlock;
- }
- }
- shrinker->id = id;
- ret = 0;
-unlock:
- up_write(&shrinker_rwsem);
- return ret;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
- int id = shrinker->id;
-
- BUG_ON(id < 0);
-
- lockdep_assert_held(&shrinker_rwsem);
-
- idr_remove(&shrinker_idr, id);
-}
-
-static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
-
- info = shrinker_info_protected(memcg, nid);
- return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
-}
-
-static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- struct shrinker_info *info;
-
- info = shrinker_info_protected(memcg, nid);
- return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
-}
-
-void reparent_shrinker_deferred(struct mem_cgroup *memcg)
-{
- int i, nid;
- long nr;
- struct mem_cgroup *parent;
- struct shrinker_info *child_info, *parent_info;
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /* Prevent from concurrent shrinker_info expand */
- down_read(&shrinker_rwsem);
- for_each_node(nid) {
- child_info = shrinker_info_protected(memcg, nid);
- parent_info = shrinker_info_protected(parent, nid);
- for (i = 0; i < child_info->map_nr_max; i++) {
- nr = atomic_long_read(&child_info->nr_deferred[i]);
- atomic_long_add(nr, &parent_info->nr_deferred[i]);
- }
- }
- up_read(&shrinker_rwsem);
-}
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
static bool cgroup_reclaim(struct scan_control *sc)
@@ -468,27 +228,6 @@ static bool writeback_throttling_sane(struct scan_control *sc)
return false;
}
#else
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return -ENOSYS;
-}
-
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-
-static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- return 0;
-}
-
-static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
- struct mem_cgroup *memcg)
-{
- return 0;
-}
-
static bool cgroup_reclaim(struct scan_control *sc)
{
return false;
@@ -557,39 +296,6 @@ static void flush_reclaim_state(struct scan_control *sc)
}
}
-static long xchg_nr_deferred(struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- int nid = sc->nid;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
- if (sc->memcg &&
- (shrinker->flags & SHRINKER_MEMCG_AWARE))
- return xchg_nr_deferred_memcg(nid, shrinker,
- sc->memcg);
-
- return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
-}
-
-
-static long add_nr_deferred(long nr, struct shrinker *shrinker,
- struct shrink_control *sc)
-{
- int nid = sc->nid;
-
- if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
- nid = 0;
-
- if (sc->memcg &&
- (shrinker->flags & SHRINKER_MEMCG_AWARE))
- return add_nr_deferred_memcg(nr, nid, shrinker,
- sc->memcg);
-
- return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
-}
-
static bool can_demote(int nid, struct scan_control *sc)
{
if (!numa_demotion_enabled)
@@ -671,413 +377,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
return size;
}
-/*
- * Add a shrinker callback to be called from the vm.
- */
-static int __prealloc_shrinker(struct shrinker *shrinker)
-{
- unsigned int size;
- int err;
-
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- err = prealloc_memcg_shrinker(shrinker);
- if (err != -ENOSYS)
- return err;
-
- shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
- }
-
- size = sizeof(*shrinker->nr_deferred);
- if (shrinker->flags & SHRINKER_NUMA_AWARE)
- size *= nr_node_ids;
-
- shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
- if (!shrinker->nr_deferred)
- return -ENOMEM;
-
- return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- va_list ap;
- int err;
-
- va_start(ap, fmt);
- shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
- va_end(ap);
- if (!shrinker->name)
- return -ENOMEM;
-
- err = __prealloc_shrinker(shrinker);
- if (err) {
- kfree_const(shrinker->name);
- shrinker->name = NULL;
- }
-
- return err;
-}
-#else
-int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- return __prealloc_shrinker(shrinker);
-}
-#endif
-
-void free_prealloced_shrinker(struct shrinker *shrinker)
-{
-#ifdef CONFIG_SHRINKER_DEBUG
- kfree_const(shrinker->name);
- shrinker->name = NULL;
-#endif
- if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
- down_write(&shrinker_rwsem);
- unregister_memcg_shrinker(shrinker);
- up_write(&shrinker_rwsem);
- return;
- }
-
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
-}
-
-void register_shrinker_prepared(struct shrinker *shrinker)
-{
- down_write(&shrinker_rwsem);
- list_add_tail(&shrinker->list, &shrinker_list);
- shrinker->flags |= SHRINKER_REGISTERED;
- shrinker_debugfs_add(shrinker);
- up_write(&shrinker_rwsem);
-}
-
-static int __register_shrinker(struct shrinker *shrinker)
-{
- int err = __prealloc_shrinker(shrinker);
-
- if (err)
- return err;
- register_shrinker_prepared(shrinker);
- return 0;
-}
-
-#ifdef CONFIG_SHRINKER_DEBUG
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- va_list ap;
- int err;
-
- va_start(ap, fmt);
- shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
- va_end(ap);
- if (!shrinker->name)
- return -ENOMEM;
-
- err = __register_shrinker(shrinker);
- if (err) {
- kfree_const(shrinker->name);
- shrinker->name = NULL;
- }
- return err;
-}
-#else
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
-{
- return __register_shrinker(shrinker);
-}
-#endif
-EXPORT_SYMBOL(register_shrinker);
-
-/*
- * Remove one
- */
-void unregister_shrinker(struct shrinker *shrinker)
-{
- struct dentry *debugfs_entry;
- int debugfs_id;
-
- if (!(shrinker->flags & SHRINKER_REGISTERED))
- return;
-
- down_write(&shrinker_rwsem);
- list_del(&shrinker->list);
- shrinker->flags &= ~SHRINKER_REGISTERED;
- if (shrinker->flags & SHRINKER_MEMCG_AWARE)
- unregister_memcg_shrinker(shrinker);
- debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
- up_write(&shrinker_rwsem);
-
- shrinker_debugfs_remove(debugfs_entry, debugfs_id);
-
- kfree(shrinker->nr_deferred);
- shrinker->nr_deferred = NULL;
-}
-EXPORT_SYMBOL(unregister_shrinker);
-
-/**
- * synchronize_shrinkers - Wait for all running shrinkers to complete.
- *
- * This is equivalent to calling unregister_shrink() and register_shrinker(),
- * but atomically and with less overhead. This is useful to guarantee that all
- * shrinker invocations have seen an update, before freeing memory, similar to
- * rcu.
- */
-void synchronize_shrinkers(void)
-{
- down_write(&shrinker_rwsem);
- up_write(&shrinker_rwsem);
-}
-EXPORT_SYMBOL(synchronize_shrinkers);
-
-#define SHRINK_BATCH 128
-
-static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
- struct shrinker *shrinker, int priority)
-{
- unsigned long freed = 0;
- unsigned long long delta;
- long total_scan;
- long freeable;
- long nr;
- long new_nr;
- long batch_size = shrinker->batch ? shrinker->batch
- : SHRINK_BATCH;
- long scanned = 0, next_deferred;
-
- freeable = shrinker->count_objects(shrinker, shrinkctl);
- if (freeable == 0 || freeable == SHRINK_EMPTY)
- return freeable;
-
- /*
- * copy the current shrinker scan count into a local variable
- * and zero it so that other concurrent shrinker invocations
- * don't also do this scanning work.
- */
- nr = xchg_nr_deferred(shrinker, shrinkctl);
-
- if (shrinker->seeks) {
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
- } else {
- /*
- * These objects don't require any IO to create. Trim
- * them aggressively under memory pressure to keep
- * them from causing refetches in the IO caches.
- */
- delta = freeable / 2;
- }
-
- total_scan = nr >> priority;
- total_scan += delta;
- total_scan = min(total_scan, (2 * freeable));
-
- trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
- freeable, delta, total_scan, priority);
-
- /*
- * Normally, we should not scan less than batch_size objects in one
- * pass to avoid too frequent shrinker calls, but if the slab has less
- * than batch_size objects in total and we are really tight on memory,
- * we will try to reclaim all available objects, otherwise we can end
- * up failing allocations although there are plenty of reclaimable
- * objects spread over several slabs with usage less than the
- * batch_size.
- *
- * We detect the "tight on memory" situations by looking at the total
- * number of objects we want to scan (total_scan). If it is greater
- * than the total number of objects on slab (freeable), we must be
- * scanning at high prio and therefore should try to reclaim as much as
- * possible.
- */
- while (total_scan >= batch_size ||
- total_scan >= freeable) {
- unsigned long ret;
- unsigned long nr_to_scan = min(batch_size, total_scan);
-
- shrinkctl->nr_to_scan = nr_to_scan;
- shrinkctl->nr_scanned = nr_to_scan;
- ret = shrinker->scan_objects(shrinker, shrinkctl);
- if (ret == SHRINK_STOP)
- break;
- freed += ret;
-
- count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
- total_scan -= shrinkctl->nr_scanned;
- scanned += shrinkctl->nr_scanned;
-
- cond_resched();
- }
-
- /*
- * The deferred work is increased by any new work (delta) that wasn't
- * done, decreased by old deferred work that was done now.
- *
- * And it is capped to two times of the freeable items.
- */
- next_deferred = max_t(long, (nr + delta - scanned), 0);
- next_deferred = min(next_deferred, (2 * freeable));
-
- /*
- * move the unused scan count back into the shrinker in a
- * manner that handles concurrent updates.
- */
- new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
-
- trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
- return freed;
-}
-
-#ifdef CONFIG_MEMCG
-static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg, int priority)
-{
- struct shrinker_info *info;
- unsigned long ret, freed = 0;
- int i;
-
- if (!mem_cgroup_online(memcg))
- return 0;
-
- if (!down_read_trylock(&shrinker_rwsem))
- return 0;
-
- info = shrinker_info_protected(memcg, nid);
- if (unlikely(!info))
- goto unlock;
-
- for_each_set_bit(i, info->map, info->map_nr_max) {
- struct shrink_control sc = {
- .gfp_mask = gfp_mask,
- .nid = nid,
- .memcg = memcg,
- };
- struct shrinker *shrinker;
-
- shrinker = idr_find(&shrinker_idr, i);
- if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
- if (!shrinker)
- clear_bit(i, info->map);
- continue;
- }
-
- /* Call non-slab shrinkers even though kmem is disabled */
- if (!memcg_kmem_online() &&
- !(shrinker->flags & SHRINKER_NONSLAB))
- continue;
-
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY) {
- clear_bit(i, info->map);
- /*
- * After the shrinker reported that it had no objects to
- * free, but before we cleared the corresponding bit in
- * the memcg shrinker map, a new object might have been
- * added. To make sure, we have the bit set in this
- * case, we invoke the shrinker one more time and reset
- * the bit if it reports that it is not empty anymore.
- * The memory barrier here pairs with the barrier in
- * set_shrinker_bit():
- *
- * list_lru_add() shrink_slab_memcg()
- * list_add_tail() clear_bit()
- * <MB> <MB>
- * set_bit() do_shrink_slab()
- */
- smp_mb__after_atomic();
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY)
- ret = 0;
- else
- set_shrinker_bit(memcg, nid, i);
- }
- freed += ret;
-
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
- }
-unlock:
- up_read(&shrinker_rwsem);
- return freed;
-}
-#else /* CONFIG_MEMCG */
-static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg, int priority)
-{
- return 0;
-}
-#endif /* CONFIG_MEMCG */
-
-/**
- * shrink_slab - shrink slab caches
- * @gfp_mask: allocation context
- * @nid: node whose slab caches to target
- * @memcg: memory cgroup whose slab caches to target
- * @priority: the reclaim priority
- *
- * Call the shrink functions to age shrinkable caches.
- *
- * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
- * unaware shrinkers will receive a node id of 0 instead.
- *
- * @memcg specifies the memory cgroup to target. Unaware shrinkers
- * are called only if it is the root cgroup.
- *
- * @priority is sc->priority, we take the number of objects and >> by priority
- * in order to get the scan target.
- *
- * Returns the number of reclaimed slab objects.
- */
-static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
- struct mem_cgroup *memcg,
- int priority)
-{
- unsigned long ret, freed = 0;
- struct shrinker *shrinker;
-
- /*
- * The root memcg might be allocated even though memcg is disabled
- * via "cgroup_disable=memory" boot parameter. This could make
- * mem_cgroup_is_root() return false, then just run memcg slab
- * shrink, but skip global shrink. This may result in premature
- * oom.
- */
- if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
- return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
-
- if (!down_read_trylock(&shrinker_rwsem))
- goto out;
-
- list_for_each_entry(shrinker, &shrinker_list, list) {
- struct shrink_control sc = {
- .gfp_mask = gfp_mask,
- .nid = nid,
- .memcg = memcg,
- };
-
- ret = do_shrink_slab(&sc, shrinker, priority);
- if (ret == SHRINK_EMPTY)
- ret = 0;
- freed += ret;
- /*
- * Bail out if someone want to register a new shrinker to
- * prevent the registration from being stalled for long periods
- * by parallel ongoing shrinking.
- */
- if (rwsem_is_contended(&shrinker_rwsem)) {
- freed = freed ? : 1;
- break;
- }
- }
-
- up_read(&shrinker_rwsem);
-out:
- cond_resched();
- return freed;
-}
-
static unsigned long drop_slab_node(int nid)
{
unsigned long freed = 0;
@@ -1112,10 +411,10 @@ static int reclaimer_offset(void)
{
BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
- BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
- PGSCAN_DIRECT - PGSCAN_KSWAPD);
BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
@@ -1678,7 +977,8 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
+ mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ nr_succeeded);
return nr_succeeded;
}
@@ -1915,6 +1215,7 @@ retry:
folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(folio))
@@ -2271,7 +1572,7 @@ static bool skip_cma(struct folio *folio, struct scan_control *sc)
{
return !current_is_kswapd() &&
gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
- get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
+ folio_migratetype(folio) == MIGRATE_CMA;
}
#else
static bool skip_cma(struct folio *folio, struct scan_control *sc)
@@ -2389,8 +1690,7 @@ move:
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- total_scan, skipped, nr_taken,
- sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
+ total_scan, skipped, nr_taken, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
@@ -2909,7 +2209,7 @@ enum scan_balance {
SCAN_FILE,
};
-static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
{
unsigned long file;
struct lruvec *target_lruvec;
@@ -2923,7 +2223,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
* Flush the memory cgroup stats, so that we read accurate per-memcg
* lruvec stats for heuristics.
*/
- mem_cgroup_flush_stats();
+ mem_cgroup_flush_stats(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
@@ -3368,13 +2668,14 @@ static void get_item_key(void *item, int *key)
key[1] = hash >> BLOOM_FILTER_SHIFT;
}
-static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+ void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ filter = READ_ONCE(mm_state->filters[gen]);
if (!filter)
return true;
@@ -3383,13 +2684,14 @@ static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *it
return test_bit(key[0], filter) && test_bit(key[1], filter);
}
-static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
+ void *item)
{
int key[2];
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ filter = READ_ONCE(mm_state->filters[gen]);
if (!filter)
return;
@@ -3401,12 +2703,12 @@ static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *
set_bit(key[1], filter);
}
-static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
{
unsigned long *filter;
int gen = filter_gen_from_seq(seq);
- filter = lruvec->mm_state.filters[gen];
+ filter = mm_state->filters[gen];
if (filter) {
bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
return;
@@ -3414,13 +2716,15 @@ static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+ WRITE_ONCE(mm_state->filters[gen], filter);
}
/******************************************************************************
* mm_struct list
******************************************************************************/
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
{
static struct lru_gen_mm_list mm_list = {
@@ -3437,6 +2741,29 @@ static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
return &mm_list;
}
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+ return &lruvec->mm_state;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+ int key;
+ struct mm_struct *mm;
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
+
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+ key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+ return NULL;
+
+ clear_bit(key, &mm->lru_gen.bitmap);
+
+ return mmget_not_zero(mm) ? mm : NULL;
+}
+
void lru_gen_add_mm(struct mm_struct *mm)
{
int nid;
@@ -3452,10 +2779,11 @@ void lru_gen_add_mm(struct mm_struct *mm)
for_each_node_state(nid, N_MEMORY) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/* the first addition since the last iteration */
- if (lruvec->mm_state.tail == &mm_list->fifo)
- lruvec->mm_state.tail = &mm->lru_gen.list;
+ if (mm_state->tail == &mm_list->fifo)
+ mm_state->tail = &mm->lru_gen.list;
}
list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
@@ -3481,14 +2809,15 @@ void lru_gen_del_mm(struct mm_struct *mm)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/* where the current iteration continues after */
- if (lruvec->mm_state.head == &mm->lru_gen.list)
- lruvec->mm_state.head = lruvec->mm_state.head->prev;
+ if (mm_state->head == &mm->lru_gen.list)
+ mm_state->head = mm_state->head->prev;
/* where the last iteration ended before */
- if (lruvec->mm_state.tail == &mm->lru_gen.list)
- lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+ if (mm_state->tail == &mm->lru_gen.list)
+ mm_state->tail = mm_state->tail->next;
}
list_del_init(&mm->lru_gen.list);
@@ -3531,10 +2860,30 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
}
#endif
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ return NULL;
+}
+
+static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
+{
+ return NULL;
+}
+
+static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
+{
+ return NULL;
+}
+
+#endif
+
static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
{
int i;
int hist;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
@@ -3542,44 +2891,20 @@ static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
hist = lru_hist_from_seq(walk->max_seq);
for (i = 0; i < NR_MM_STATS; i++) {
- WRITE_ONCE(lruvec->mm_state.stats[hist][i],
- lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+ WRITE_ONCE(mm_state->stats[hist][i],
+ mm_state->stats[hist][i] + walk->mm_stats[i]);
walk->mm_stats[i] = 0;
}
}
if (NR_HIST_GENS > 1 && last) {
- hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+ hist = lru_hist_from_seq(mm_state->seq + 1);
for (i = 0; i < NR_MM_STATS; i++)
- WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+ WRITE_ONCE(mm_state->stats[hist][i], 0);
}
}
-static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
-{
- int type;
- unsigned long size = 0;
- struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
-
- if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
- return true;
-
- clear_bit(key, &mm->lru_gen.bitmap);
-
- for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
- size += type ? get_mm_counter(mm, MM_FILEPAGES) :
- get_mm_counter(mm, MM_ANONPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
- }
-
- if (size < MIN_LRU_BATCH)
- return true;
-
- return !mmget_not_zero(mm);
-}
-
static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
struct mm_struct **iter)
{
@@ -3588,7 +2913,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
struct mm_struct *mm = NULL;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
/*
* mm_state->seq is incremented after each iteration of mm_list. There
@@ -3626,11 +2951,7 @@ static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
mm_state->tail = mm_state->head->next;
walk->force_scan = true;
}
-
- mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
- if (should_skip_mm(mm, walk))
- mm = NULL;
- } while (!mm);
+ } while (!(mm = get_next_mm(walk)));
done:
if (*iter || last)
reset_mm_stats(lruvec, walk, last);
@@ -3638,7 +2959,7 @@ done:
spin_unlock(&mm_list->lock);
if (mm && first)
- reset_bloom_filter(lruvec, walk->max_seq + 1);
+ reset_bloom_filter(mm_state, walk->max_seq + 1);
if (*iter)
mmput_async(*iter);
@@ -3653,7 +2974,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
bool success = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
spin_lock(&mm_list->lock);
@@ -3949,7 +3270,6 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
return pfn;
}
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
{
unsigned long pfn = pmd_pfn(pmd);
@@ -3967,7 +3287,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
return pfn;
}
-#endif
static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
struct pglist_data *pgdat, bool can_swap)
@@ -4070,7 +3389,6 @@ restart:
return suitable_to_scan(total, young);
}
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
{
@@ -4148,12 +3466,6 @@ next:
done:
*first = -1;
}
-#else
-static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
- struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
-{
-}
-#endif
static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
struct mm_walk *args)
@@ -4166,6 +3478,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
unsigned long first = -1;
struct lru_gen_mm_walk *walk = args->private;
+ struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
VM_WARN_ON_ONCE(pud_leaf(*pud));
@@ -4188,7 +3501,6 @@ restart:
continue;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(val)) {
unsigned long pfn = pmd_pfn(val);
struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
@@ -4207,7 +3519,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
continue;
}
-#endif
+
walk->mm_stats[MM_NONLEAF_TOTAL]++;
if (should_clear_pmd_young()) {
@@ -4217,7 +3529,7 @@ restart:
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
}
- if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i))
continue;
walk->mm_stats[MM_NONLEAF_FOUND]++;
@@ -4228,7 +3540,7 @@ restart:
walk->mm_stats[MM_NONLEAF_ADDED]++;
/* carry over to the next generation */
- update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+ update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i);
}
walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
@@ -4435,16 +3747,25 @@ next:
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ bool can_swap, bool force_scan)
{
+ bool success;
int prev, next;
int type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
restart:
+ if (max_seq < READ_ONCE(lrugen->max_seq))
+ return false;
+
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ success = max_seq == lrugen->max_seq;
+ if (!success)
+ goto unlock;
+
for (type = ANON_AND_FILE - 1; type >= 0; type--) {
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
@@ -4488,8 +3809,10 @@ restart:
WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
-
+unlock:
spin_unlock_irq(&lruvec->lru_lock);
+
+ return success;
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
@@ -4499,14 +3822,16 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ if (!mm_state)
+ return inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+
/* see the comment in iterate_mm_list() */
- if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
- success = false;
- goto done;
- }
+ if (max_seq <= READ_ONCE(mm_state->seq))
+ return false;
/*
* If the hardware doesn't automatically set the accessed bit, fallback
@@ -4536,8 +3861,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
walk_mm(lruvec, mm, walk);
} while (mm);
done:
- if (success)
- inc_max_seq(lruvec, can_swap, force_scan);
+ if (success) {
+ success = inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+ WARN_ON_ONCE(!success);
+ }
return success;
}
@@ -4656,11 +3983,13 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
int young = 0;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
struct folio *folio = pfn_folio(pvmw->pfn);
bool can_swap = !folio_is_file_lru(folio);
struct mem_cgroup *memcg = folio_memcg(folio);
struct pglist_data *pgdat = folio_pgdat(folio);
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
DEFINE_MAX_SEQ(lruvec);
int old_gen, new_gen = lru_gen_from_seq(max_seq);
@@ -4670,11 +3999,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (spin_is_contended(pvmw->ptl))
return;
+ /* exclude special VMAs containing anon pages from COW */
+ if (vma->vm_flags & VM_SPECIAL)
+ return;
+
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
- start = max(addr & PMD_MASK, pvmw->vma->vm_start);
- end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+ start = max(addr & PMD_MASK, vma->vm_start);
+ end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
@@ -4699,7 +4032,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, pvmw->vma, addr);
+ pfn = get_pte_pfn(ptent, vma, addr);
if (pfn == -1)
continue;
@@ -4710,7 +4043,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
if (!folio)
continue;
- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ if (!ptep_test_and_clear_young(vma, addr, pte + i))
VM_WARN_ON_ONCE(true);
young++;
@@ -4739,8 +4072,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
mem_cgroup_unlock_pages();
/* feedback from rmap walkers to page table walkers */
- if (suitable_to_scan(i, young))
- update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+ if (mm_state && suitable_to_scan(i, young))
+ update_bloom_filter(mm_state, max_seq, pvmw->pmd);
}
/******************************************************************************
@@ -4756,13 +4089,6 @@ enum {
MEMCG_LRU_YOUNG,
};
-#ifdef CONFIG_MEMCG
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
- return READ_ONCE(lruvec->lrugen.seg);
-}
-
static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
{
int seg;
@@ -4790,6 +4116,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
else
VM_WARN_ON_ONCE(true);
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+ WRITE_ONCE(lruvec->lrugen.gen, new);
+
hlist_nulls_del_rcu(&lruvec->lrugen.list);
if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
@@ -4800,15 +4129,14 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
pgdat->memcg_lru.nr_memcgs[old]--;
pgdat->memcg_lru.nr_memcgs[new]++;
- lruvec->lrugen.gen = new;
- WRITE_ONCE(lruvec->lrugen.seg, seg);
-
if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
}
+#ifdef CONFIG_MEMCG
+
void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
int gen;
@@ -4825,11 +4153,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
gen = get_memcg_gen(pgdat->memcg_lru.seq);
+ lruvec->lrugen.gen = gen;
+
hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
pgdat->memcg_lru.nr_memcgs[gen]++;
- lruvec->lrugen.gen = gen;
-
spin_unlock_irq(&pgdat->memcg_lru.lock);
}
}
@@ -4876,18 +4204,11 @@ void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
struct lruvec *lruvec = get_lruvec(memcg, nid);
/* see the comment on MEMCG_NR_GENS */
- if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}
-#else /* !CONFIG_MEMCG */
-
-static int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
- return 0;
-}
-
-#endif
+#endif /* CONFIG_MEMCG */
/******************************************************************************
* the eviction
@@ -4933,7 +4254,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* protected */
- if (tier > tier_idx) {
+ if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false);
@@ -5005,6 +4326,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int sorted = 0;
int scanned = 0;
int isolated = 0;
+ int skipped = 0;
int remaining = MAX_LRU_BATCH;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -5018,7 +4340,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (i = MAX_NR_ZONES; i > 0; i--) {
LIST_HEAD(moved);
- int skipped = 0;
+ int skipped_zone = 0;
int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
struct list_head *head = &lrugen->folios[gen][type][zone];
@@ -5040,16 +4362,17 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
isolated += delta;
} else {
list_move(&folio->lru, &moved);
- skipped += delta;
+ skipped_zone += delta;
}
- if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+ if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH)
break;
}
- if (skipped) {
+ if (skipped_zone) {
list_splice(&moved, head);
- __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
+ skipped += skipped_zone;
}
if (!remaining || isolated >= MIN_LRU_BATCH)
@@ -5064,6 +4387,9 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_memcg_events(memcg, item, isolated);
__count_memcg_events(memcg, PGREFILL, sorted);
__count_vm_events(PGSCAN_ANON + type, isolated);
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
+ scanned, skipped, isolated,
+ type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
/*
* There might not be eligible folios due to reclaim_idx. Check the
@@ -5194,6 +4520,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
sc->nr_reclaimed += reclaimed;
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ scanned, reclaimed, &stat, sc->priority,
+ type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
list_for_each_entry_safe_reverse(folio, next, &list, lru) {
if (!folio_evictable(folio)) {
@@ -5291,7 +4620,12 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
}
/* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+ if (!mem_cgroup_online(memcg)) {
+ *nr_to_scan = total;
+ return false;
+ }
+
+ *nr_to_scan = total >> sc->priority;
/*
* The aging tries to be lazy to reduce the overhead, while the eviction
@@ -5328,7 +4662,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
DEFINE_MAX_SEQ(lruvec);
if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
- return 0;
+ return -1;
if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;
@@ -5341,20 +4675,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}
-static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
{
+ int i;
+ enum zone_watermarks mark;
+
/* don't abort memcg reclaim to ensure fairness */
if (!root_reclaim(sc))
- return -1;
+ return false;
+
+ if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
+ return true;
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (!current_is_kswapd() || sc->order)
+ return false;
- return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
+ WMARK_PROMO : WMARK_HIGH;
+
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
+
+ if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
+ return false;
+ }
+
+ /* kswapd should abort if all eligible zones are safe */
+ return true;
}
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
long nr_to_scan;
unsigned long scanned = 0;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
int swappiness = get_swappiness(lruvec, sc);
/* clean file folios are more likely to exist */
@@ -5376,13 +4731,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
if (scanned >= nr_to_scan)
break;
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;
cond_resched();
}
- /* whether try_to_inc_max_seq() was successful */
+ /* whether this lruvec should be rotated */
return nr_to_scan < 0;
}
@@ -5391,14 +4746,9 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
bool success;
unsigned long scanned = sc->nr_scanned;
unsigned long reclaimed = sc->nr_reclaimed;
- int seg = lru_gen_memcg_seg(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- /* see the comment on MEMCG_NR_GENS */
- if (!lruvec_is_sizable(lruvec, sc))
- return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
-
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(NULL, memcg))
@@ -5406,7 +4756,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
if (mem_cgroup_below_low(NULL, memcg)) {
/* see the comment on MEMCG_NR_GENS */
- if (seg != MEMCG_LRU_TAIL)
+ if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
return MEMCG_LRU_TAIL;
memcg_memory_event(memcg, MEMCG_LOW);
@@ -5422,10 +4772,16 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
flush_reclaim_state(sc);
- return success ? MEMCG_LRU_YOUNG : 0;
-}
+ if (success && mem_cgroup_online(memcg))
+ return MEMCG_LRU_YOUNG;
-#ifdef CONFIG_MEMCG
+ if (!success && lruvec_is_sizable(lruvec, sc))
+ return 0;
+
+ /* one retry if offlined or too small */
+ return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
+ MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+}
static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
@@ -5436,14 +4792,13 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
struct lruvec *lruvec;
struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
- const struct hlist_nulls_node *pos;
- unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ struct hlist_nulls_node *pos;
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
restart:
op = 0;
memcg = NULL;
- gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
rcu_read_lock();
@@ -5454,6 +4809,10 @@ restart:
}
mem_cgroup_put(memcg);
+ memcg = NULL;
+
+ if (gen != READ_ONCE(lrugen->gen))
+ continue;
lruvec = container_of(lrugen, struct lruvec, lrugen);
memcg = lruvec_memcg(lruvec);
@@ -5470,7 +4829,7 @@ restart:
rcu_read_lock();
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (should_abort_scan(lruvec, sc))
break;
}
@@ -5481,7 +4840,7 @@ restart:
mem_cgroup_put(memcg);
- if (sc->nr_reclaimed >= nr_to_reclaim)
+ if (!is_a_nulls(pos))
return;
/* restart if raced with lru_gen_rotate_memcg() */
@@ -5515,20 +4874,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
-#else /* !CONFIG_MEMCG */
-
-static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
-{
- BUILD_BUG();
-}
-
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
-{
- BUILD_BUG();
-}
-
-#endif
-
static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
{
int priority;
@@ -5538,16 +4883,14 @@ static void set_initial_priority(struct pglist_data *pgdat, struct scan_control
if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
return;
/*
- * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
- * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
- * estimated reclaimed_to_scanned_ratio = inactive / total.
+ * Determine the initial priority based on
+ * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+ * where reclaimed_to_scanned_ratio = inactive / total.
*/
reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_swappiness(lruvec, sc))
reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
- reclaimable /= MEMCG_NR_GENS;
-
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
@@ -5878,6 +5221,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
int type, tier;
int hist = lru_hist_from_seq(seq);
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5903,6 +5247,9 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}
+ if (!mm_state)
+ return;
+
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
const char *s = " ";
@@ -5910,10 +5257,10 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
if (seq == max_seq && NR_HIST_GENS == 1) {
s = "LOYNFA";
- n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
s = "loynfa";
- n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ n = READ_ONCE(mm_state->stats[hist][i]);
}
seq_printf(m, " %10lu%c", n, s[i]);
@@ -6177,11 +5524,24 @@ static const struct file_operations lru_gen_ro_fops = {
* initialization
******************************************************************************/
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
int i;
int gen, type, zone;
struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
@@ -6192,47 +5552,46 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
- lruvec->mm_state.seq = MIN_NR_GENS;
+ if (mm_state)
+ mm_state->seq = MIN_NR_GENS;
}
#ifdef CONFIG_MEMCG
-void lru_gen_init_pgdat(struct pglist_data *pgdat)
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
- int i, j;
-
- spin_lock_init(&pgdat->memcg_lru.lock);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- for (i = 0; i < MEMCG_NR_GENS; i++) {
- for (j = 0; j < MEMCG_NR_BINS; j++)
- INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
- }
-}
+ if (!mm_list)
+ return;
-void lru_gen_init_memcg(struct mem_cgroup *memcg)
-{
- INIT_LIST_HEAD(&memcg->mm_list.fifo);
- spin_lock_init(&memcg->mm_list.lock);
+ INIT_LIST_HEAD(&mm_list->fifo);
+ spin_lock_init(&mm_list->lock);
}
void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
int i;
int nid;
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+ VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
+ struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
sizeof(lruvec->lrugen.nr_pages)));
lruvec->lrugen.list.next = LIST_POISON1;
+ if (!mm_state)
+ continue;
+
for (i = 0; i < NR_BLOOM_FILTERS; i++) {
- bitmap_free(lruvec->mm_state.filters[i]);
- lruvec->mm_state.filters[i] = NULL;
+ bitmap_free(mm_state->filters[i]);
+ mm_state->filters[i] = NULL;
}
}
}
@@ -6258,14 +5617,17 @@ late_initcall(init_lru_gen);
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
+ BUILD_BUG();
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
+ BUILD_BUG();
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
+ BUILD_BUG();
}
#endif /* CONFIG_LRU_GEN */
@@ -6535,7 +5897,7 @@ again:
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- prepare_scan_count(pgdat, sc);
+ prepare_scan_control(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
@@ -7058,7 +6420,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
* scan_control uses s8 fields for order, priority, and reclaim_idx.
* Confirm they are large enough for max values.
*/
- BUILD_BUG_ON(MAX_ORDER >= S8_MAX);
+ BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
@@ -7892,8 +7254,9 @@ void __meminit kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
+ pr_err("Failed to start kswapd on node %d,ret=%ld\n",
+ nid, PTR_ERR(pgdat->kswapd));
BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
pgdat->kswapd = NULL;
}
}
@@ -8026,6 +7389,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
cond_resched();
psi_memstall_enter(&pflags);
+ delayacct_freepages_start();
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
@@ -8048,6 +7412,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
psi_memstall_leave(&pflags);
+ delayacct_freepages_end();
trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);