summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2019-05-14 15:47:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-14 19:52:53 -0700
commit205b20cc5a99cdf197c32f4dbee2b09c699477f0 (patch)
treefb3f160fd9e861845bccbc93bb25d068af060533 /mm
parent6a024330650e24556b8a18cc654ad00cfecf6c6c (diff)
downloadlinux-205b20cc5a99cdf197c32f4dbee2b09c699477f0.tar.gz
linux-205b20cc5a99cdf197c32f4dbee2b09c699477f0.tar.bz2
linux-205b20cc5a99cdf197c32f4dbee2b09c699477f0.zip
mm: memcontrol: make cgroup stats and events query API explicitly local
Patch series "mm: memcontrol: memory.stat cost & correctness". The cgroup memory.stat file holds recursive statistics for the entire subtree. The current implementation does this tree walk on-demand whenever the file is read. This is giving us problems in production. 1. The cost of aggregating the statistics on-demand is high. A lot of system service cgroups are mostly idle and their stats don't change between reads, yet we always have to check them. There are also always some lazily-dying cgroups sitting around that are pinned by a handful of remaining page cache; the same applies to them. In an application that periodically monitors memory.stat in our fleet, we have seen the aggregation consume up to 5% CPU time. 2. When cgroups die and disappear from the cgroup tree, so do their accumulated vm events. The result is that the event counters at higher-level cgroups can go backwards and confuse some of our automation, let alone people looking at the graphs over time. To address both issues, this patch series changes the stat implementation to spill counts upwards when the counters change. The upward spilling is batched using the existing per-cpu cache. In a sparse file stress test with 5 level cgroup nesting, the additional cost of the flushing was negligible (a little under 1% of CPU at 100% CPU utilization, compared to the 5% of reading memory.stat during regular operation). This patch (of 4): memcg_page_state(), lruvec_page_state(), memcg_sum_events() are currently returning the state of the local memcg or lruvec, not the recursive state. In practice there is a demand for both versions, although the callers that want the recursive counts currently sum them up by hand. Per default, cgroups are considered recursive entities and generally we expect more users of the recursive counters, with the local counts being special cases. To reflect that in the name, add a _local suffix to the current implementations. The following patch will re-incarnate these functions with recursive semantics, but with an O(1) implementation. [hannes@cmpxchg.org: fix bisection hole] Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Shakeel Butt <shakeelb@google.com> Reviewed-by: Roman Gushchin <guro@fb.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/memcontrol.c40
-rw-r--r--mm/vmscan.c6
-rw-r--r--mm/workingset.c7
3 files changed, 28 insertions, 25 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 52a47f4e28c7..9e95bf221feb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -687,8 +687,8 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- int event)
+static unsigned long memcg_events_local(struct mem_cgroup *memcg,
+ int event)
{
return atomic_long_read(&memcg->vmevents[event]);
}
@@ -1325,12 +1325,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
+ K(memcg_page_state_local(iter,
+ memcg1_stats[i])));
}
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(memcg_page_state(iter, NR_LRU_BASE + i)));
+ K(memcg_page_state_local(iter,
+ NR_LRU_BASE + i)));
pr_cont("\n");
}
@@ -1396,13 +1398,13 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
{
struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
- lruvec_page_state(lruvec, NR_ACTIVE_FILE))
+ if (lruvec_page_state_local(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state_local(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
- lruvec_page_state(lruvec, NR_ACTIVE_ANON))
+ if (lruvec_page_state_local(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state_local(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -2961,16 +2963,16 @@ static void accumulate_vmstats(struct mem_cgroup *memcg,
for_each_mem_cgroup_tree(mi, memcg) {
for (i = 0; i < acc->vmstats_size; i++)
- acc->vmstats[i] += memcg_page_state(mi,
+ acc->vmstats[i] += memcg_page_state_local(mi,
acc->vmstats_array ? acc->vmstats_array[i] : i);
for (i = 0; i < acc->vmevents_size; i++)
- acc->vmevents[i] += memcg_sum_events(mi,
+ acc->vmevents[i] += memcg_events_local(mi,
acc->vmevents_array
? acc->vmevents_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] += memcg_page_state(mi,
+ acc->lru_pages[i] += memcg_page_state_local(mi,
NR_LRU_BASE + i);
}
}
@@ -2983,10 +2985,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
+ val += memcg_page_state_local(iter, MEMCG_CACHE);
+ val += memcg_page_state_local(iter, MEMCG_RSS);
if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
+ val += memcg_page_state_local(iter, MEMCG_SWAP);
}
} else {
if (!swap)
@@ -3328,7 +3330,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3342,7 +3344,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3427,17 +3429,17 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
+ memcg_page_state_local(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "%s %lu\n", memcg1_event_names[i],
- memcg_sum_events(memcg, memcg1_events[i]));
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- memcg_page_state(memcg, NR_LRU_BASE + i) *
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
/* Hierarchical information */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d96c54703948..7acd0afdfc2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
int zid;
if (!mem_cgroup_disabled())
- lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
@@ -2150,7 +2150,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
if (file && actual_reclaim && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
@@ -2912,7 +2912,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
- refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 6419baebd306..e0b4edcb88c8 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -430,9 +430,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
- pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
- pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+ pages += lruvec_page_state_local(lruvec,
+ NR_LRU_BASE + i);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
} else
#endif
pages = node_present_pages(sc->nid);