From b8e57efa2c98cc56c49461c4950cf026422c29e9 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 5 Oct 2018 15:52:10 -0700
Subject: mm/vmscan.c: fix int overflow in callers of do_shrink_slab()

do_shrink_slab() returns an unsigned long value, and placing it into an
int variable cuts the high bytes off. Then we compare ret with 0xfffffffe
(since SHRINK_EMPTY is converted to ret's type). Thus a large number of
objects returned by do_shrink_slab() may be interpreted as SHRINK_EMPTY,
if the low bytes of its value are equal to 0xfffffffe. Fix that by
declaring ret as unsigned long in these functions.

Link: http://lkml.kernel.org/r/153813407177.17544.14888305435570723973.stgit@localhost.localdomain
Signed-off-by: Kirill Tkhai
Reported-by: Cyrill Gorcunov
Acked-by: Cyrill Gorcunov
Reviewed-by: Josef Bacik
Cc: Michal Hocko
Cc: Andrey Ryabinin
Cc: Johannes Weiner
Cc: Tetsuo Handa
Cc: Shakeel Butt
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 mm/vmscan.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c7ce2c161225..c5ef7240cbcb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -580,8 +580,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			struct mem_cgroup *memcg, int priority)
 {
 	struct memcg_shrinker_map *map;
-	unsigned long freed = 0;
-	int ret, i;
+	unsigned long ret, freed = 0;
+	int i;
 
 	if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
 		return 0;
@@ -677,9 +677,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
 				 int priority)
 {
+	unsigned long ret, freed = 0;
 	struct shrinker *shrinker;
-	unsigned long freed = 0;
-	int ret;
 
 	if (!mem_cgroup_is_root(memcg))
 		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
--
cgit v1.2.3
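To see the truncation concretely: the following is a minimal userspace sketch (not kernel code) of the bug the patch above fixes. It assumes a 64-bit build where unsigned long is 64 bits and int is 32 bits; the variable names are illustrative, and SHRINK_EMPTY merely mirrors the kernel's definition.

#include <stdio.h>

#define SHRINK_EMPTY (~0UL - 1)	/* mirrors the kernel's definition */

int main(void)
{
	/*
	 * A perfectly valid "objects freed" count returned by
	 * do_shrink_slab() whose low 32 bits happen to be 0xfffffffe.
	 */
	unsigned long freed = 0x100000000UL + 0xfffffffeUL;

	int ret_int = freed;			/* high bytes cut off: -2 on typical compilers */
	unsigned long ret_ulong = freed;	/* value preserved */

	/*
	 * -2 converts back to ~0UL - 1 in the comparison, so the int
	 * version is wrongly taken for SHRINK_EMPTY.
	 */
	printf("int:           mistaken for SHRINK_EMPTY? %d\n",
	       ret_int == SHRINK_EMPTY);
	printf("unsigned long: mistaken for SHRINK_EMPTY? %d\n",
	       ret_ulong == SHRINK_EMPTY);
	return 0;
}

On such a build the first comparison prints 1 and the second 0, which is exactly the misinterpretation described in the changelog.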
From 4e17ec250fce0eba9b70a91c9622da2748a3ec50 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Wed, 29 Nov 2017 08:32:39 -0500
Subject: mm: Convert delete_from_swap_cache to XArray

Both callers of __delete_from_swap_cache have the swp_entry_t already,
so pass that in to make constructing the XA_STATE easier.

Signed-off-by: Matthew Wilcox
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c7ce2c161225..80f731cf974e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -923,7 +923,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
-		__delete_from_swap_cache(page);
+		__delete_from_swap_cache(page, swap);
 		xa_unlock_irqrestore(&mapping->i_pages, flags);
 		put_swap_page(page, swap);
 	} else {
--
cgit v1.2.3

From 67891ffff2f5cf10e89e348207f687a05c8bd2d6 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Sun, 10 Jun 2018 07:34:39 -0400
Subject: mm: Convert is_page_cache_freeable to XArray

This is just a variable rename and comment change.

Signed-off-by: Matthew Wilcox
---
 mm/vmscan.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 80f731cf974e..f9cc86e91812 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -742,12 +742,12 @@ static inline int is_page_cache_freeable(struct page *page)
 {
 	/*
 	 * A freeable page cache page is referenced only by the caller
-	 * that isolated the page, the page cache radix tree and
-	 * optional buffer heads at page->private.
+	 * that isolated the page, the page cache and optional buffer
+	 * heads at page->private.
 	 */
-	int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+	int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
 		HPAGE_PMD_NR : 1;
-	return page_count(page) - page_has_private(page) == 1 + radix_pins;
+	return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
 }
 
 static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
--
cgit v1.2.3

From 68600f623d69da428c6163275f97ca126e1a8ec5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 26 Oct 2018 15:03:27 -0700
Subject: mm: don't miss the last page because of round-off error

I've noticed that dying memory cgroups are often pinned in memory by a
single pagecache page. Even under moderate memory pressure they sometimes
stayed in such a state for a long time. That looked strange.

My investigation showed that the problem is caused by applying the LRU
pressure balancing math:

	scan = div64_u64(scan * fraction[lru], denominator),

where

	denominator = fraction[anon] + fraction[file] + 1.

Because fraction[lru] is always less than denominator, if the initial scan
size is 1, the result is always 0.

This means the last page is not scanned and has no chances to be
reclaimed.

Fix this by rounding up the result of the division.

In practice this change significantly improves the speed of dying cgroups
reclaim.

[guro@fb.com: prevent double calculation of DIV64_U64_ROUND_UP() arguments]
  Link: http://lkml.kernel.org/r/20180829213311.GA13501@castle
Link: http://lkml.kernel.org/r/20180827162621.30187-3-guro@fb.com
Signed-off-by: Roman Gushchin
Reviewed-by: Andrew Morton
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Tejun Heo
Cc: Rik van Riel
Cc: Konstantin Khlebnikov
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5ef7240cbcb..961401c46334 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2456,9 +2456,11 @@ out:
 			/*
 			 * Scan types proportional to swappiness and
 			 * their relative recent reclaim efficiency.
+			 * Make sure we don't miss the last page
+			 * because of a round-off error.
 			 */
-			scan = div64_u64(scan * fraction[file],
-					 denominator);
+			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+						  denominator);
 			break;
 		case SCAN_FILE:
 		case SCAN_ANON:
--
cgit v1.2.3
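The round-off fix above can be checked with a small standalone sketch. DIV_ROUND_UP_ULL below only mimics the rounding behaviour of the kernel's DIV64_U64_ROUND_UP(), and the fraction values are made up for illustration.

#include <stdio.h>

/*
 * Mimics the rounding of the kernel's DIV64_U64_ROUND_UP(); unlike the
 * kernel macro it evaluates its arguments more than once.
 */
#define DIV_ROUND_UP_ULL(x, d)	(((x) + (d) - 1) / (d))

int main(void)
{
	unsigned long long scan = 1;		/* one last page on an LRU list */
	unsigned long long fraction_file = 40;	/* made-up pressure fractions */
	unsigned long long fraction_anon = 60;
	unsigned long long denominator = fraction_file + fraction_anon + 1;

	/* Old behaviour: truncating division turns the last page into 0. */
	printf("truncated:  %llu\n", scan * fraction_file / denominator);

	/* New behaviour: rounding up keeps at least one page to scan. */
	printf("rounded up: %llu\n",
	       DIV_ROUND_UP_ULL(scan * fraction_file, denominator));
	return 0;
}

With scan == 1 the truncating division prints 0 while the rounded-up version prints 1, so the last page stays scannable and the dying cgroup can finally be freed.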
From 1899ad18c6072d689896badafb81267b0a1092a4 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Fri, 26 Oct 2018 15:06:04 -0700
Subject: mm: workingset: tell cache transitions from workingset thrashing

Refaults happen during transitions between workingsets as well as
in-place thrashing. Knowing the difference between the two has a range
of applications, including measuring the impact of memory shortage on
the system performance, as well as the ability to balance pressure more
intelligently between the filesystem cache and the swap-backed
workingset.

During workingset transitions, inactive cache refaults and pushes out
established active cache. When that active cache isn't stale, however,
and also ends up refaulting, that's bona fide thrashing.

Introduce a new page flag that tells on eviction whether the page has
been active or not in its lifetime. This bit is then stored in the
shadow entry, to classify refaults as transitioning or thrashing.

How many page->flags does this leave us with on 32-bit?

	20 bits are always page flags

	21 if you have an MMU

	23 with the zone bits for DMA, Normal, HighMem, Movable

	29 with the sparsemem section bits

	30 if PAE is enabled

	31 with this patch.

So on 32-bit PAE, that leaves 1 bit for distinguishing two NUMA nodes.
If that's not enough, the system can switch to discontigmem and re-gain
the 6 or 7 sparsemem section bits.

Link: http://lkml.kernel.org/r/20180828172258.3185-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Peter Zijlstra (Intel)
Tested-by: Daniel Drake
Tested-by: Suren Baghdasaryan
Cc: Christopher Lameter
Cc: Ingo Molnar
Cc: Johannes Weiner
Cc: Mike Galbraith
Cc: Peter Enderborg
Cc: Randy Dunlap
Cc: Shakeel Butt
Cc: Tejun Heo
Cc: Vinayak Menon
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 961401c46334..87e9fef341d2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2145,6 +2145,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		}
 
 		ClearPageActive(page);	/* we are de-activating */
+		SetPageWorkingset(page);
 		list_add(&page->lru, &l_inactive);
 	}
--
cgit v1.2.3
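The classification the new flag enables can be pictured with a toy model. This is a sketch only: the struct, field, and function names are invented for illustration and do not reflect the kernel's actual shadow-entry encoding.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the information recorded at eviction time. */
struct shadow_entry {
	bool workingset;	/* page had been active before it was evicted */
};

static const char *classify_refault(const struct shadow_entry *shadow)
{
	/* A refault of once-active cache is thrashing; otherwise it is a
	 * workingset transition still building up. */
	return shadow->workingset ?
		"thrashing: established active cache is refaulting" :
		"transition: a new workingset is still building up";
}

int main(void)
{
	struct shadow_entry never_activated = { .workingset = false };
	struct shadow_entry was_active = { .workingset = true };

	printf("%s\n", classify_refault(&never_activated));
	printf("%s\n", classify_refault(&was_active));
	return 0;
}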
From eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Fri, 26 Oct 2018 15:06:27 -0700
Subject: psi: pressure stall information for CPU, memory, and IO

When systems are overcommitted and resources become contended, it's hard
to tell exactly the impact this has on workload productivity, or how
close the system is to lockups and OOM kills. In particular, when
machines work multiple jobs concurrently, the impact of overcommit in
terms of latency and throughput on the individual job can be enormous.

In order to maximize hardware utilization without sacrificing individual
job health or risking complete machine lockups, this patch implements a
way to quantify resource pressure in the system.

A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that
expose the percentage of time the system is stalled on CPU, memory, or
IO, respectively. Stall states are aggregate versions of the per-task
delay accounting delays:

	cpu: some tasks are runnable but not executing on a CPU
	memory: tasks are reclaiming, or waiting for swapin or thrashing cache
	io: tasks are waiting for io completions

These percentages of walltime can be thought of as pressure percentages,
and they give a general sense of system health and productivity loss
incurred by resource overcommit. They can also indicate when the system
is approaching lockup scenarios and OOMs.

To do this, psi keeps track of the task states associated with each CPU
and samples the time they spend in stall states. Every 2 seconds, the
samples are averaged across CPUs - weighted by the CPUs' non-idle time to
eliminate artifacts from unused CPUs - and translated into percentages of
walltime. A running average of those percentages is maintained over 10s,
1m, and 5m periods (similar to the loadaverage).

[hannes@cmpxchg.org: doc fixlet, per Randy]
  Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org
[hannes@cmpxchg.org: code optimization]
  Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org
[hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter]
  Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org
[hannes@cmpxchg.org: fix build]
  Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org
Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Peter Zijlstra (Intel)
Tested-by: Daniel Drake
Tested-by: Suren Baghdasaryan
Cc: Christopher Lameter
Cc: Ingo Molnar
Cc: Johannes Weiner
Cc: Mike Galbraith
Cc: Peter Enderborg
Cc: Randy Dunlap
Cc: Shakeel Butt
Cc: Tejun Heo
Cc: Vinayak Menon
Cc: Randy Dunlap
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 87e9fef341d2..8ea87586925e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -49,6 +49,7 @@
 #include <linux/prefetch.h>
 #include <linux/printk.h>
 #include <linux/dax.h>
+#include <linux/psi.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3305,6 +3306,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	unsigned long pflags;
 	int nid;
 	unsigned int noreclaim_flag;
 	struct scan_control sc = {
@@ -3333,9 +3335,13 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.gfp_mask,
 					    sc.reclaim_idx);
 
+	psi_memstall_enter(&pflags);
 	noreclaim_flag = memalloc_noreclaim_save();
+
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
 	memalloc_noreclaim_restore(noreclaim_flag);
+	psi_memstall_leave(&pflags);
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -3500,6 +3506,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	int i;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	unsigned long pflags;
 	struct zone *zone;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -3510,6 +3517,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		.may_swap = 1,
 	};
 
+	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();
 
 	count_vm_event(PAGEOUTRUN);
@@ -3611,6 +3619,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
+	psi_memstall_leave(&pflags);
 	/*
 	 * Return the order kswapd stopped reclaiming at as
 	 * prepare_kswapd_sleep() takes it into account. If another caller
--
cgit v1.2.3
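A minimal userspace consumer of the new interface might look like the sketch below. It assumes only the /proc/pressure/memory file described above and prints whatever pressure lines the kernel exposes.

#include <stdio.h>

/*
 * Dump the memory pressure file added by CONFIG_PSI=y. It typically
 * contains a "some" line (and, for memory and io, a "full" line) with
 * avg10/avg60/avg300 percentages and a cumulative stall total.
 */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/pressure/memory", "r");

	if (!f) {
		perror("fopen /proc/pressure/memory (is CONFIG_PSI enabled?)");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}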
From 4b85afbdacd290c7a22c96df40a6433fdcacb509 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Fri, 26 Oct 2018 15:06:42 -0700
Subject: mm: zero-seek shrinkers

The page cache and most shrinkable slab caches hold data that has been
read from disk, but there are some caches that only cache CPU work, such
as the dentry and inode caches of procfs and sysfs, as well as the subset
of radix tree nodes that track non-resident page cache.

Currently, all these are shrunk at the same rate: using DEFAULT_SEEKS for
the shrinker's seeks setting tells the reclaim algorithm that for every
two page cache pages scanned it should scan one slab object.

This is a bogus setting. A virtual inode that required no IO to create is
not twice as valuable as a page cache page; shadow cache entries with
eviction distances beyond the size of memory aren't either.

In most cases, the behavior in practice is still fine. Such virtual
caches don't tend to grow and assert themselves aggressively, and usually
get picked up before they cause problems. But there are scenarios where
that's not true.

Our database workloads suffer from two of those. For one, their file
workingset is several times bigger than available memory, which has the
kernel aggressively create shadow page cache entries for the non-resident
parts of it. The workingset code does tell the VM that most of these are
expendable, but the VM ends up balancing them 2:1 to cache pages as per
the seeks setting. This is a huge waste of memory.

These workloads also deal with tens of thousands of open files and use
/proc for introspection, which ends up growing the proc_inode_cache to
absurdly large sizes - again at the cost of valuable cache space, which
isn't a reasonable trade-off, given that proc inodes can be re-created
without involving the disk.

This patch implements a "zero-seek" setting for shrinkers that results in
a target ratio of 0:1 between their objects and IO-backed caches. This
allows such virtual caches to grow when memory is available (they do
cache/avoid CPU work after all), but effectively disables them as soon as
IO-backed objects are under pressure.

It then switches the shrinkers for procfs and sysfs metadata, as well as
excess page cache shadow nodes, to the new zero-seek setting.

Link: http://lkml.kernel.org/r/20181009184732.762-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Reported-by: Domas Mituzas
Reviewed-by: Andrew Morton
Reviewed-by: Rik van Riel
Acked-by: Peter Zijlstra (Intel)
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'mm/vmscan.c')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ea87586925e..28c9ae5633b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -474,9 +474,18 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 
 	total_scan = nr;
-	delta = freeable >> priority;
-	delta *= 4;
-	do_div(delta, shrinker->seeks);
+	if (shrinker->seeks) {
+		delta = freeable >> priority;
+		delta *= 4;
+		do_div(delta, shrinker->seeks);
+	} else {
+		/*
+		 * These objects don't require any IO to create. Trim
+		 * them aggressively under memory pressure to keep
+		 * them from causing refetches in the IO caches.
+		 */
+		delta = freeable / 2;
+	}
 
 	/*
 	 * Make sure we apply some minimal pressure on default priority
--
cgit v1.2.3
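The effect of the zero-seek setting on scan pressure can be reproduced with a standalone sketch of the delta calculation quoted in the hunk above; plain division stands in for do_div(), and the input numbers are hypothetical.

#include <stdio.h>

#define DEFAULT_SEEKS	2	/* value used by most kernel shrinkers */

/* Standalone copy of the "delta" calculation from the hunk above. */
static unsigned long long shrink_delta(unsigned long long freeable,
				       int priority, unsigned int seeks)
{
	unsigned long long delta;

	if (seeks) {
		delta = freeable >> priority;
		delta *= 4;
		delta /= seeks;
	} else {
		/* No IO is needed to recreate these objects: offer half. */
		delta = freeable / 2;
	}
	return delta;
}

int main(void)
{
	unsigned long long freeable = 1ULL << 20;	/* hypothetical object count */
	int priority = 12;				/* DEF_PRIORITY */

	printf("DEFAULT_SEEKS shrinker: %llu objects\n",
	       shrink_delta(freeable, priority, DEFAULT_SEEKS));
	printf("zero-seek shrinker:     %llu objects\n",
	       shrink_delta(freeable, priority, 0));
	return 0;
}

With these numbers the DEFAULT_SEEKS shrinker is offered 512 objects to scan, while the zero-seek shrinker is offered half of everything it holds (524288), which is what "effectively disables them as soon as IO-backed objects are under pressure" means in practice.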