summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMina Almasry <almasrymina@google.com>2022-12-02 14:35:31 -0800
committerAndrew Morton <akpm@linux-foundation.org>2022-12-11 18:12:19 -0800
commit12a5d3955227b0d7e04fb793ccceeb2a1dd275c5 (patch)
tree968ecb9387bb93478c13ba34b8be7d669890da45 /mm
parent6b426d071419a40f61fe41fe1bd9e1b4fa5aeb37 (diff)
downloadlinux-12a5d3955227b0d7e04fb793ccceeb2a1dd275c5.tar.gz
linux-12a5d3955227b0d7e04fb793ccceeb2a1dd275c5.tar.bz2
linux-12a5d3955227b0d7e04fb793ccceeb2a1dd275c5.zip
mm: add nodes= arg to memory.reclaim
The nodes= arg instructs the kernel to only scan the given nodes for proactive reclaim. For example use cases, consider a 2 tier memory system: nodes 0,1 -> top tier nodes 2,3 -> second tier $ echo "1m nodes=0" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory from node 0. Since node 0 is a top tier node, demotion will be attempted first. This is useful to direct proactive reclaim to specific nodes that are under pressure. $ echo "1m nodes=2,3" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory in the second tier, since this tier of memory has no demotion targets the memory will be reclaimed. $ echo "1m nodes=0,1" > memory.reclaim Instructs the kernel to reclaim memory from the top tier nodes, which can be desirable according to the userspace policy if there is pressure on the top tiers. Since these nodes have demotion targets, the kernel will attempt demotion first. Since commit 3f1509c57b1b ("Revert "mm/vmscan: never demote for memcg reclaim""), the proactive reclaim interface memory.reclaim does both reclaim and demotion. Reclaim and demotion incur different latency costs to the jobs in the cgroup. Demoted memory would still be addressable by the userspace at a higher latency, but reclaimed memory would need to incur a pagefault. The 'nodes' arg is useful to allow the userspace to control demotion and reclaim independently according to its policy: if the memory.reclaim is called on a node with demotion targets, it will attempt demotion first; if it is called on a node without demotion targets, it will only attempt reclaim. Link: https://lkml.kernel.org/r/20221202223533.1785418-1-almasrymina@google.com Signed-off-by: Mina Almasry <almasrymina@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Shakeel Butt <shakeelb@google.com> Acked-by: Muchun Song <songmuchun@bytedance.com> Cc: Bagas Sanjaya <bagasdotme@gmail.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Tejun Heo <tj@kernel.org> Cc: Wei Xu <weixugc@google.com> Cc: Yang Shi <yang.shi@linux.alibaba.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: zefan li <lizefan.x@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c67
-rw-r--r--mm/vmscan.c4
2 files changed, 57 insertions, 14 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2c7a91689fef..ff65bc23be13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -2392,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP);
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2683,7 +2685,8 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options);
+ gfp_mask, reclaim_options,
+ NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3503,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
+ NULL)) {
ret = -EBUSY;
break;
}
@@ -3614,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP))
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_retries--;
}
@@ -6418,7 +6423,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -6467,7 +6473,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_reclaims--;
continue;
}
@@ -6590,21 +6597,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+enum {
+ MEMORY_RECLAIM_NODES = 0,
+ MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t if_tokens = {
+ { MEMORY_RECLAIM_NODES, "nodes=%s" },
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
- unsigned int reclaim_options;
- int err;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+ char value[256];
+ nodemask_t nodemask = NODE_MASK_ALL;
buf = strstrip(buf);
- err = page_counter_memparse(buf, "", &nr_to_reclaim);
- if (err)
- return err;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ token = match_token(start, if_tokens, args);
+ match_strlcpy(value, args, sizeof(value));
+ switch (token) {
+ case MEMORY_RECLAIM_NODES:
+ if (nodelist_parse(value, nodemask) < 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@@ -6621,7 +6661,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options);
+ GFP_KERNEL, reclaim_options,
+ &nodemask);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a59171c6695..2b42ac9ad755 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6758,7 +6758,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6773,6 +6774,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put