From b8dd3ee9cacc2398700e5227931988931871e020 Mon Sep 17 00:00:00 2001 From: xupanda Date: Mon, 15 Aug 2022 06:51:04 +0000 Subject: mm: memcontrol: fix a typo in comment Fix a spelling mistake in comment. Link: https://lkml.kernel.org/r/20220815065102.74347-1-xu.panda@zte.com.cn Reported-by: Zeal Robot Signed-off-by: xupanda Signed-off-by: CGEL ZTE Reviewed-by: Mukesh Ojha Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..4dddd8be320a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1143,7 +1143,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. -- cgit v1.2.3 From e09b0b61fbbf28e0f528ca17af5c3e445109f147 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 16 Aug 2022 11:58:01 -0700 Subject: mm: memcg: export workingset refault stats for cgroup v1 Workingset refault stats are important and useful metrics to measure how well reclaimer and swapping work and how healthy the services are, but they are just available for cgroup v2. There are still plenty users with cgroup v1, export the stats for cgroup v1. Link: https://lkml.kernel.org/r/20220816185801.651091-1-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4dddd8be320a..403af5f7a2b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3975,6 +3975,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3988,6 +3990,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4016,7 +4020,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4047,7 +4052,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) -- cgit v1.2.3 From ec1c86b25f4bdd9dce6436c0539d2a6ae676e1c4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:02 -0600 Subject: mm: multi-gen LRU: groundwork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing. Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in folio->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it stores 0. There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual. The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because: 1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit. 2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit. 3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads. There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present; the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4]. The next patch will address the "outlying refaults". Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy. A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS. [1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 [3] https://lwn.net/Articles/495543/ [4] https://lwn.net/Articles/815342/ Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 403af5f7a2b9..937141d48221 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5175,6 +5175,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { + lru_gen_exit_memcg(memcg); memcg_wb_domain_exit(memcg); __mem_cgroup_free(memcg); } @@ -5233,6 +5234,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); return memcg; fail: mem_cgroup_id_remove(memcg); -- cgit v1.2.3 From 018ee47f14893d500131dfca2ff9f3ff8ebd4ed2 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:04 -0600 Subject: mm: multi-gen LRU: exploit locality in rmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Searching the rmap for PTEs mapping each page on an LRU list (to test and clear the accessed bit) can be expensive because pages from different VMAs (PA space) are not cache friendly to the rmap (VA space). For workloads mostly using mapped pages, searching the rmap can incur the highest CPU cost in the reclaim path. This patch exploits spatial locality to reduce the trips into the rmap. When shrink_page_list() walks the rmap and finds a young PTE, a new function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent PTEs. On finding another young PTE, it clears the accessed bit and updates the gen counter of the page mapped by this PTE to (max_seq%MAX_NR_GENS)+1. Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[3, 5]% Ops/sec KB/sec patch1-6: 1106168.46 43025.04 patch1-7: 1147696.57 44640.29 Configurations: no change Client benchmark results: kswapd profiles: patch1-6 39.03% lzo1x_1_do_compress (real work) 18.47% page_vma_mapped_walk (overhead) 6.74% _raw_spin_unlock_irq 3.97% do_raw_spin_lock 2.49% ptep_clear_flush 2.48% anon_vma_interval_tree_iter_first 1.92% folio_referenced_one 1.88% __zram_bvec_write 1.48% memmove 1.31% vma_interval_tree_iter_next patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset Configurations: no change Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Barry Song Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 937141d48221..4ea49113b0dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * - LRU isolation * - lock_page_memcg() * - exclusive reference + * - mem_cgroup_trylock_pages() */ folio->memcg_data = (unsigned long)memcg; } -- cgit v1.2.3 From bd74fdaea146029e4fa12c6de89adbe0779348a9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sun, 18 Sep 2022 02:00:05 -0600 Subject: mm: multi-gen LRU: support page table walks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To further exploit spatial locality, the aging prefers to walk page tables to search for young PTEs and promote hot pages. A kill switch will be added in the next patch to disable this behavior. When disabled, the aging relies on the rmap only. NB: this behavior has nothing similar with the page table scanning in the 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages to swapcache and unmaps them. To avoid confusion, the term "iteration" specifically means the traversal of an entire mm_struct list; the term "walk" will be applied to page tables and the rmap, as usual. An mm_struct list is maintained for each memcg, and an mm_struct follows its owner task to the new memcg when this task is migrated. Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls walk_page_range() with each mm_struct on this list to promote hot pages before it increments max_seq. When multiple page table walkers iterate the same list, each of them gets a unique mm_struct; therefore they can run concurrently. Page table walkers ignore any misplaced pages, e.g., if an mm_struct was migrated, pages it left in the previous memcg will not be promoted when its current memcg is under reclaim. Similarly, page table walkers will not promote pages from nodes other than the one under reclaim. This patch uses the following optimizations when walking page tables: 1. It tracks the usage of mm_struct's between context switches so that page table walkers can skip processes that have been sleeping since the last iteration. 2. It uses generational Bloom filters to record populated branches so that page table walkers can reduce their search space based on the query results, e.g., to skip page tables containing mostly holes or misplaced pages. 3. It takes advantage of the accessed bit in non-leaf PMD entries when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. 4. It does not zigzag between a PGD table and the same PMD table spanning multiple VMAs. IOW, it finishes all the VMAs within the range of the same PMD table before it returns to a PGD table. This improves the cache performance for workloads that have large numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5. Server benchmark results: Single workload: fio (buffered I/O): no change Single workload: memcached (anon): +[8, 10]% Ops/sec KB/sec patch1-7: 1147696.57 44640.29 patch1-8: 1245274.91 48435.66 Configurations: no change Client benchmark results: kswapd profiles: patch1-7 48.16% lzo1x_1_do_compress (real work) 8.20% page_vma_mapped_walk (overhead) 7.06% _raw_spin_unlock_irq 2.92% ptep_clear_flush 2.53% __zram_bvec_write 2.11% do_raw_spin_lock 2.02% memmove 1.93% lru_gen_look_around 1.56% free_unref_page_list 1.40% memset patch1-8 49.44% lzo1x_1_do_compress (real work) 6.19% page_vma_mapped_walk (overhead) 5.97% _raw_spin_unlock_irq 3.13% get_pfn_folio 2.85% ptep_clear_flush 2.42% __zram_bvec_write 2.08% do_raw_spin_lock 1.92% memmove 1.44% alloc_zspage 1.36% memset Configurations: no change Thanks to the following developers for their efforts [3]. kernel test robot [1] https://lwn.net/Articles/23732/ [2] https://llvm.org/docs/ScudoHardenedAllocator.html [3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/ Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Brian Geffon Acked-by: Jan Alexander Steffens (heftig) Acked-by: Oleksandr Natalenko Acked-by: Steven Barrett Acked-by: Suleiman Souhlal Tested-by: Daniel Byrne Tested-by: Donald Carr Tested-by: Holger Hoffstätte Tested-by: Konstantin Kharlamov Tested-by: Shuang Zhai Tested-by: Sofia Trinh Tested-by: Vaibhav Jain Cc: Andi Kleen Cc: Aneesh Kumar K.V Cc: Barry Song Cc: Catalin Marinas Cc: Dave Hansen Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Qi Zheng Cc: Tejun Heo Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4ea49113b0dd..392b1fd1e8c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6204,6 +6204,30 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + /* find the first leader if there is any */ + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && READ_ONCE(task->mm->owner) == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6609,6 +6633,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, -- cgit v1.2.3 From ba0aff8ea6ff0ba4dacfc896facadf3d91c8cd8a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: mm/memcontrol: stop using mm->highest_vm_end Pass through ULONG_MAX instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-56-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 392b1fd1e8c4..e804056422db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5879,7 +5879,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6177,9 +6177,7 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } -- cgit v1.2.3 From 6599591816f522c1cc8ec4eb5cea75738963756a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e804056422db..621b4472c409 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6844,21 +6844,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; -- cgit v1.2.3 From cb691e2f28bc63b1a872aa593dd542ee796e8364 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:34 +0100 Subject: mm: remove lookup_swap_cache() All callers have now been converted to swap_cache_get_folio(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 621b4472c409..9863fb588972 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5569,7 +5569,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); -- cgit v1.2.3 From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9863fb588972..632402001bca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7406,18 +7406,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } -bool mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; -- cgit v1.2.3 From 410f8e82689e1e66044fea51ef852054a09502b7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. However memcg code uses small portion of those entries. This patch series eliminate this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats. Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 632402001bca..0a44a733bb03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,40 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -827,7 +861,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + return READ_ONCE(memcg->vmstats->events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -5170,6 +5204,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -5199,6 +5234,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5418,9 +5457,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5433,15 +5472,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5452,9 +5491,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { -- cgit v1.2.3 From d396def5d86dbeb4ceb4a9dca92611ce206dc66a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:36 +0000 Subject: memcg: rearrange code This is a preparatory patch for easing the review of the follow up patch which will reduce the memory overhead of memory cgroups. Link: https://lkml.kernel.org/r/20220907043537.3457014-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a44a733bb03..78fd7cfb4f92 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,29 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; @@ -1501,29 +1524,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; -- cgit v1.2.3 From 8278f1c7b4920105f2f30a8df9b8212b378101d2 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:37 +0000 Subject: memcg: reduce size of memcg vmstats structures The struct memcg_vmstats and struct memcg_vmstats_percpu contains two arrays each for events of size NR_VM_EVENT_ITEMS which can be as large as 110. However the memcg v1 only uses 4 of those while memcg v2 uses 15. The union of both is 17. On a 64 bit system, we are wasting approximately ((110 - 17) * 8 * 2) * (nr_cpus + 1) bytes which is significant on large machines. This patch reduces the size of the given structures by adding one indirection and only stores array of events which are actually used by the memcg code. With this patch, the size of memcg_vmstats has reduced from 2544 bytes to 1056 bytes while the size of memcg_vmstats_percpu has reduced from 2568 bytes to 1080 bytes. [akpm@linux-foundation.org: fix memcg_events_local() array index, per Shakeel] Link: https://lkml.kernel.org/r/CALvZod70Mvxr+Nzb6k0yiU2RFYjTD=0NFhKK-Eyp+5ejd1PSFw@mail.gmail.com Link: https://lkml.kernel.org/r/20220907043537.3457014-4-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78fd7cfb4f92..1f204a262054 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -671,6 +671,8 @@ static void flush_memcg_stats_dwork(struct work_struct *w) /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSTEAL_KSWAPD, @@ -692,14 +694,30 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; + unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; @@ -709,11 +727,11 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; + unsigned long events_pending[NR_MEMCG_EVENTS]; }; unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -873,27 +891,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats->events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } @@ -1564,10 +1592,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; + seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -5309,6 +5342,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5477,7 +5511,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + for (i = 0; i < NR_MEMCG_EVENTS; i++) { delta = memcg->vmstats->events_pending[i]; if (delta) memcg->vmstats->events_pending[i] = 0; -- cgit v1.2.3 From 4988fe69527c6e02066aeb454c2db4d6d51d317b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 13 Sep 2022 15:13:58 +0800 Subject: mm/memcontrol: use kstrtobool for swapaccount param parsing Use kstrtobool which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220913071358.1812206-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f204a262054..ac6440daf208 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7507,10 +7507,10 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + bool res; + + if (!kstrtobool(s, &res)) + cgroup_memory_noswap = !res; return 1; } __setup("swapaccount=", setup_swap_account); -- cgit v1.2.3 From c1b8fdae62e59904ecdfe4f50410575ea02fec18 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:34 +0800 Subject: mm: memcontrol: make cgroup_memory_noswap a static key cgroup_memory_noswap is used in many hot path, so make it a static key to lower the kernel overhead. Using 8G of ZRAM as SWAP, benchmark using `perf stat -d -d -d --repeat 100` with the following code snip in a non-root cgroup: #include #include #include #include #define MB 1024UL * 1024UL int main(int argc, char **argv){ void *p = mmap(NULL, 8000 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0xff, 8000 * MB); madvise(p, 8000 * MB, MADV_PAGEOUT); memset(p, 0xff, 8000 * MB); return 0; } Before: 7,021.43 msec task-clock # 0.967 CPUs utilized ( +- 0.03% ) 4,010 context-switches # 573.853 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,057 page-faults # 293.661 K/sec ( +- 0.00% ) 12,616,546,027 cycles # 1.805 GHz ( +- 0.06% ) (39.92%) 156,823,666 stalled-cycles-frontend # 1.25% frontend cycles idle ( +- 0.10% ) (40.25%) 310,130,812 stalled-cycles-backend # 2.47% backend cycles idle ( +- 4.39% ) (40.73%) 18,692,516,591 instructions # 1.49 insn per cycle # 0.01 stalled cycles per insn ( +- 0.04% ) (40.75%) 4,907,447,976 branches # 702.283 M/sec ( +- 0.05% ) (40.30%) 13,002,578 branch-misses # 0.26% of all branches ( +- 0.08% ) (40.48%) 7,069,786,296 L1-dcache-loads # 1.012 G/sec ( +- 0.03% ) (40.32%) 649,385,847 L1-dcache-load-misses # 9.13% of all L1-dcache accesses ( +- 0.07% ) (40.10%) 1,485,448,688 L1-icache-loads # 212.576 M/sec ( +- 0.15% ) (39.49%) 31,628,457 L1-icache-load-misses # 2.13% of all L1-icache accesses ( +- 0.40% ) (39.57%) 6,667,311 dTLB-loads # 954.129 K/sec ( +- 0.21% ) (39.50%) 5,668,555 dTLB-load-misses # 86.40% of all dTLB cache accesses ( +- 0.12% ) (39.03%) 765 iTLB-loads # 109.476 /sec ( +- 21.81% ) (39.44%) 4,370,351 iTLB-load-misses # 214320.09% of all iTLB cache accesses ( +- 1.44% ) (39.86%) 149,207,254 L1-dcache-prefetches # 21.352 M/sec ( +- 0.13% ) (40.27%) 7.25869 +- 0.00203 seconds time elapsed ( +- 0.03% ) After: 6,576.16 msec task-clock # 0.953 CPUs utilized ( +- 0.10% ) 4,020 context-switches # 605.595 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,056 page-faults # 309.133 K/sec ( +- 0.00% ) 11,967,619,180 cycles # 1.803 GHz ( +- 0.36% ) (38.76%) 161,259,240 stalled-cycles-frontend # 1.38% frontend cycles idle ( +- 0.27% ) (36.58%) 253,605,302 stalled-cycles-backend # 2.16% backend cycles idle ( +- 4.45% ) (34.78%) 19,328,171,892 instructions # 1.65 insn per cycle # 0.01 stalled cycles per insn ( +- 0.10% ) (31.46%) 5,213,967,902 branches # 785.461 M/sec ( +- 0.18% ) (30.68%) 12,385,170 branch-misses # 0.24% of all branches ( +- 0.26% ) (34.13%) 7,271,687,822 L1-dcache-loads # 1.095 G/sec ( +- 0.12% ) (35.29%) 649,873,045 L1-dcache-load-misses # 8.93% of all L1-dcache accesses ( +- 0.11% ) (41.41%) 1,950,037,608 L1-icache-loads # 293.764 M/sec ( +- 0.33% ) (43.11%) 31,365,566 L1-icache-load-misses # 1.62% of all L1-icache accesses ( +- 0.39% ) (45.89%) 6,767,809 dTLB-loads # 1.020 M/sec ( +- 0.47% ) (48.42%) 6,339,590 dTLB-load-misses # 95.43% of all dTLB cache accesses ( +- 0.50% ) (46.60%) 736 iTLB-loads # 110.875 /sec ( +- 1.79% ) (48.60%) 4,314,836 iTLB-load-misses # 518653.73% of all iTLB cache accesses ( +- 0.63% ) (42.91%) 144,950,156 L1-dcache-prefetches # 21.836 M/sec ( +- 0.37% ) (41.39%) 6.89935 +- 0.00703 seconds time elapsed ( +- 0.10% ) The performance is clearly better. There is no significant hotspot improvement according to perf report, as there are quite a few callers of memcg_swap_enabled and do_memsw_account (which calls memcg_swap_enabled). Many pieces of minor optimizations resulted in lower overhead for the branch predictor, and bettter performance. Link: https://lkml.kernel.org/r/20220919180634.45958-3-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac6440daf208..6b74bbdc2659 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -90,9 +90,18 @@ static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; +static bool cgroup_memory_noswap __initdata; + +static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); +static inline bool memcg_swap_enabled(void) +{ + return static_branch_likely(&memcg_swap_enabled_key); +} #else -#define cgroup_memory_noswap 1 +static inline bool memcg_swap_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -102,7 +111,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7370,7 +7379,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg_swap_enabled() && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7422,7 +7431,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7454,7 +7463,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7470,7 +7479,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7487,7 +7496,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = folio_memcg(folio); @@ -7795,6 +7804,8 @@ static int __init mem_cgroup_swap_init(void) if (cgroup_memory_noswap) return 0; + static_branch_enable(&memcg_swap_enabled_key); + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From c91bdc9358992856721ff77887202a7e80b7ab22 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:01 -0400 Subject: mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled Patch series "memcg swap fix & cleanups". This patch (of 4): Since commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), the cgroup swap arrays are used to track memory ownership at the time of swap readahead and swapoff, even if swap space *accounting* has been turned off by the user via swapaccount=0 (which sets cgroup_memory_noswap). However, the patch was overzealous: by simply dropping the cgroup_memory_noswap conditionals in the swapon, swapoff and uncharge path, it caused the cgroup arrays being allocated even when the memory controller as a whole is disabled. This is a waste of that memory. Restore mem_cgroup_disabled() checks, implied previously by cgroup_memory_noswap, in the swapon, swapoff, and swap_entry_free callbacks. Link: https://lkml.kernel.org/r/20220926135704.400818-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20220926135704.400818-2-hannes@cmpxchg.org Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") Signed-off-by: Johannes Weiner Reported-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b74bbdc2659..9e3c010ca676 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7459,6 +7459,9 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); -- cgit v1.2.3 From b25806dcd3d5248833f7d2544ee29a701735159f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:02 -0400 Subject: mm: memcontrol: deprecate swapaccounting=0 mode The swapaccounting= commandline option already does very little today. To close a trivial containment failure case, the swap ownership tracking part of the swap controller has recently become mandatory (see commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") for details), which makes up the majority of the work during swapout, swapin, and the swap slot map. The only thing left under this flag is the page_counter operations and the visibility of the swap control files in the first place, which are rather meager savings. There also aren't many scenarios, if any, where controlling the memory of a cgroup while allowing it unlimited access to a global swap space is a workable resource isolation strategy. On the other hand, there have been several bugs and confusion around the many possible swap controller states (cgroup1 vs cgroup2 behavior, memory accounting without swap accounting, memcg runtime disabled). This puts the maintenance overhead of retaining the toggle above its practical benefits. Deprecate it. Link: https://lkml.kernel.org/r/20220926135704.400818-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 50 ++++++++++---------------------------------------- 1 file changed, 10 insertions(+), 40 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e3c010ca676..4be1b48b9659 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,22 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __initdata; - -static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); -static inline bool memcg_swap_enabled(void) -{ - return static_branch_likely(&memcg_swap_enabled_key); -} -#else -static inline bool memcg_swap_enabled(void) -{ - return false; -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -111,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7379,7 +7363,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg_swap_enabled() && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7431,7 +7415,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && + if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7466,7 +7450,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { + if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7482,7 +7466,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7499,7 +7483,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; memcg = folio_memcg(folio); @@ -7519,10 +7503,9 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - bool res; - - if (!kstrtobool(s, &res)) - cgroup_memory_noswap = !res; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7791,24 +7774,11 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; - static_branch_enable(&memcg_swap_enabled_key); - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -7816,6 +7786,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ -- cgit v1.2.3 From b94c4e949c36e0e363515822ade0d8305e9a6ef2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:03 -0400 Subject: mm: memcontrol: use do_memsw_account() in a few more places It's slightly more descriptive and consistent with other places that distinguish cgroup1's combined memory+swap accounting scheme from cgroup2's dedicated swap accounting. Link: https://lkml.kernel.org/r/20220926135704.400818-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be1b48b9659..76bb0a18a2f3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1667,17 +1667,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -7334,7 +7334,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7399,7 +7399,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7451,10 +7451,10 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); -- cgit v1.2.3 From e55b9f96860f6c6026cff97966a740576285e07b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76bb0a18a2f3..61e05fc281fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3423,7 +3423,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -7296,7 +7296,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7788,4 +7788,4 @@ static int __init mem_cgroup_swap_init(void) } subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ -- cgit v1.2.3