diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 86 | ||||
-rw-r--r-- | mm/Makefile | 8 | ||||
-rw-r--r-- | mm/cma.c | 57 | ||||
-rw-r--r-- | mm/compaction.c | 47 | ||||
-rw-r--r-- | mm/damon/core.c | 24 | ||||
-rw-r--r-- | mm/damon/dbgfs.c | 2 | ||||
-rw-r--r-- | mm/damon/sysfs.c | 2 | ||||
-rw-r--r-- | mm/damon/tests/.kunitconfig | 22 | ||||
-rw-r--r-- | mm/damon/tests/core-kunit.h (renamed from mm/damon/core-test.h) | 35 | ||||
-rw-r--r-- | mm/damon/tests/dbgfs-kunit.h (renamed from mm/damon/dbgfs-test.h) | 10 | ||||
-rw-r--r-- | mm/damon/tests/sysfs-kunit.h (renamed from mm/damon/sysfs-test.h) | 0 | ||||
-rw-r--r-- | mm/damon/tests/vaddr-kunit.h (renamed from mm/damon/vaddr-test.h) | 2 | ||||
-rw-r--r-- | mm/damon/vaddr.c | 2 | ||||
-rw-r--r-- | mm/debug.c | 31 | ||||
-rw-r--r-- | mm/debug_vm_pgtable.c | 50 | ||||
-rw-r--r-- | mm/filemap.c | 67 | ||||
-rw-r--r-- | mm/folio-compat.c | 12 | ||||
-rw-r--r-- | mm/gup.c | 68 | ||||
-rw-r--r-- | mm/huge_memory.c | 589 | ||||
-rw-r--r-- | mm/hugetlb.c | 442 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 4 | ||||
-rw-r--r-- | mm/hugetlb_vmemmap.c | 40 | ||||
-rw-r--r-- | mm/internal.h | 216 | ||||
-rw-r--r-- | mm/kfence/core.c | 53 | ||||
-rw-r--r-- | mm/kfence/kfence.h | 1 | ||||
-rw-r--r-- | mm/kfence/report.c | 15 | ||||
-rw-r--r-- | mm/khugepaged.c | 75 | ||||
-rw-r--r-- | mm/kmemleak.c | 159 | ||||
-rw-r--r-- | mm/ksm.c | 146 | ||||
-rw-r--r-- | mm/madvise.c | 13 | ||||
-rw-r--r-- | mm/memblock.c | 2 | ||||
-rw-r--r-- | mm/memcontrol-v1.c | 126 | ||||
-rw-r--r-- | mm/memcontrol-v1.h | 26 | ||||
-rw-r--r-- | mm/memcontrol.c | 490 | ||||
-rw-r--r-- | mm/memory-failure.c | 92 | ||||
-rw-r--r-- | mm/memory-tiers.c | 22 | ||||
-rw-r--r-- | mm/memory.c | 562 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 85 | ||||
-rw-r--r-- | mm/mempolicy.c | 8 | ||||
-rw-r--r-- | mm/migrate.c | 270 | ||||
-rw-r--r-- | mm/migrate_device.c | 108 | ||||
-rw-r--r-- | mm/mm_init.c | 12 | ||||
-rw-r--r-- | mm/mmap.c | 2156 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 2 | ||||
-rw-r--r-- | mm/mmzone.c | 2 | ||||
-rw-r--r-- | mm/mprotect.c | 86 | ||||
-rw-r--r-- | mm/mremap.c | 32 | ||||
-rw-r--r-- | mm/mseal.c | 55 | ||||
-rw-r--r-- | mm/nommu.c | 11 | ||||
-rw-r--r-- | mm/numa.c | 69 | ||||
-rw-r--r-- | mm/numa_emulation.c | 571 | ||||
-rw-r--r-- | mm/numa_memblks.c | 571 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 349 | ||||
-rw-r--r-- | mm/page_counter.c | 48 | ||||
-rw-r--r-- | mm/page_io.c | 113 | ||||
-rw-r--r-- | mm/page_isolation.c | 36 | ||||
-rw-r--r-- | mm/pagewalk.c | 202 | ||||
-rw-r--r-- | mm/percpu.c | 31 | ||||
-rw-r--r-- | mm/rmap.c | 71 | ||||
-rw-r--r-- | mm/shmem.c | 450 | ||||
-rw-r--r-- | mm/shmem_quota.c | 3 | ||||
-rw-r--r-- | mm/show_mem.c | 11 | ||||
-rw-r--r-- | mm/shrinker_debug.c | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 27 | ||||
-rw-r--r-- | mm/swap.c | 298 | ||||
-rw-r--r-- | mm/swap.h | 44 | ||||
-rw-r--r-- | mm/swap_cgroup.c | 2 | ||||
-rw-r--r-- | mm/swap_state.c | 78 | ||||
-rw-r--r-- | mm/swapfile.c | 1482 | ||||
-rw-r--r-- | mm/userfaultfd.c | 170 | ||||
-rw-r--r-- | mm/util.c | 102 | ||||
-rw-r--r-- | mm/vma.c | 2068 | ||||
-rw-r--r-- | mm/vma.h | 558 | ||||
-rw-r--r-- | mm/vma_internal.h | 49 | ||||
-rw-r--r-- | mm/vmalloc.c | 139 | ||||
-rw-r--r-- | mm/vmscan.c | 67 | ||||
-rw-r--r-- | mm/vmstat.c | 28 | ||||
-rw-r--r-- | mm/z3fold.c | 2 | ||||
-rw-r--r-- | mm/zsmalloc.c | 15 | ||||
-rw-r--r-- | mm/zswap.c | 307 |
81 files changed, 9148 insertions, 5242 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b72e7d040f78..09aebca1cae3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -128,7 +128,7 @@ config ZSWAP_COMPRESSOR_DEFAULT choice prompt "Default allocator" depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if HAVE_ZSMALLOC + default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU default ZSWAP_ZPOOL_DEFAULT_ZBUD help Selects the default allocator for the compressed cache for @@ -146,15 +146,17 @@ config ZSWAP_ZPOOL_DEFAULT_ZBUD help Use the zbud allocator as the default allocator. -config ZSWAP_ZPOOL_DEFAULT_Z3FOLD - bool "z3fold" - select Z3FOLD +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED + bool "z3foldi (DEPRECATED)" + select Z3FOLD_DEPRECATED help Use the z3fold allocator as the default allocator. + Deprecated and scheduled for removal in a few cycles, + see CONFIG_Z3FOLD_DEPRECATED. + config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC bool "zsmalloc" - depends on HAVE_ZSMALLOC select ZSMALLOC help Use the zsmalloc allocator as the default allocator. @@ -164,7 +166,7 @@ config ZSWAP_ZPOOL_DEFAULT string depends on ZSWAP default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD - default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC default "" @@ -178,24 +180,29 @@ config ZBUD deterministic reclaim properties that make it preferable to a higher density approach when reclaim will be used. -config Z3FOLD - tristate "3:1 compression allocator (z3fold)" +config Z3FOLD_DEPRECATED + tristate "3:1 compression allocator (z3fold) (DEPRECATED)" depends on ZSWAP help + Deprecated and scheduled for removal in a few cycles. If you have + a good reason for using Z3FOLD over ZSMALLOC, please contact + linux-mm@kvack.org and the zswap maintainers. + A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical page. It is a ZBUD derivative so the simplicity and determinism are still there. -config HAVE_ZSMALLOC - def_bool y - depends on MMU - depends on PAGE_SIZE_LESS_THAN_256KB # we want <= 64 KiB +config Z3FOLD + tristate + default y if Z3FOLD_DEPRECATED=y + default m if Z3FOLD_DEPRECATED=m + depends on Z3FOLD_DEPRECATED config ZSMALLOC tristate - prompt "N:1 compression allocator (zsmalloc)" if ZSWAP - depends on HAVE_ZSMALLOC + prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) + depends on MMU help zsmalloc is a slab-based memory allocator designed to store pages of various compression levels efficiently. It achieves @@ -585,17 +592,21 @@ config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE # at the same time (e.g. copy_page_range()). # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # -config SPLIT_PTLOCK_CPUS - int - default "999999" if !MMU - default "999999" if ARM && !CPU_CACHE_VIPT - default "999999" if PARISC && !PA20 - default "999999" if SPARC32 - default "4" +config SPLIT_PTE_PTLOCKS + def_bool y + depends on MMU + depends on NR_CPUS >= 4 + depends on !ARM || CPU_CACHE_VIPT + depends on !PARISC || PA20 + depends on !SPARC32 config ARCH_ENABLE_SPLIT_PMD_PTLOCK bool +config SPLIT_PMD_PTLOCKS + def_bool y + depends on SPLIT_PTE_PTLOCKS && ARCH_ENABLE_SPLIT_PMD_PTLOCK + # # support for memory balloon config MEMORY_BALLOON @@ -877,6 +888,19 @@ endif # TRANSPARENT_HUGEPAGE config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# TODO: Allow to be enabled without THP +config ARCH_SUPPORTS_HUGE_PFNMAP + def_bool n + depends on TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PMD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE + +config ARCH_SUPPORTS_PUD_PFNMAP + def_bool y + depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + # # UP and nommu archs use km based percpu allocator # @@ -1081,13 +1105,10 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool -config ARCH_USES_PG_ARCH_X +config ARCH_USES_PG_ARCH_2 + bool +config ARCH_USES_PG_ARCH_3 bool - help - Enable the definition of PG_arch_x page flags with x > 1. Only - suitable for 64-bit architectures with CONFIG_FLATMEM or - CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be - enough room for additional bits in page->flags. config VM_EVENT_COUNTERS default y @@ -1263,6 +1284,17 @@ config IOMMU_MM_DATA config EXECMEM bool +config NUMA_MEMBLKS + bool + +config NUMA_EMU + bool "NUMA emulation" + depends on NUMA_MEMBLKS + help + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=fake=N", where N is the + number of nodes. This is only useful for debugging. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index d2915f8c9dc0..d5639b036166 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,7 +37,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ msync.o page_vma_mapped.o pagewalk.o \ - pgtable-generic.o rmap.o vmalloc.o + pgtable-generic.o rmap.o vmalloc.o vma.o ifdef CONFIG_CROSS_MEMORY_ATTACH @@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shrinker.o \ shmem.o util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o show_mem.o shmem_quota.o\ + compaction.o show_mem.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) @@ -117,6 +117,9 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_Z3FOLD) += z3fold.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o @@ -141,3 +144,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o +obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o @@ -202,7 +202,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, cma->order_per_bit = order_per_bit; *res_cma = cma; cma_area_count++; - totalcma_pages += (size / PAGE_SIZE); + totalcma_pages += cma->count; return 0; } @@ -403,18 +403,8 @@ static void cma_debug_show_areas(struct cma *cma) spin_unlock_irq(&cma->lock); } -/** - * cma_alloc() - allocate pages from contiguous area - * @cma: Contiguous memory region for which the allocation is performed. - * @count: Requested number of pages. - * @align: Requested alignment of pages (in PAGE_SIZE order). - * @no_warn: Avoid printing message about failed allocation - * - * This function allocates part of contiguous memory on specific - * contiguous memory area. - */ -struct page *cma_alloc(struct cma *cma, unsigned long count, - unsigned int align, bool no_warn) +static struct page *__cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; @@ -463,8 +453,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); @@ -494,7 +483,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, page_kasan_tag_reset(nth_page(page, i)); } - if (ret && !no_warn) { + if (ret && !(gfp & __GFP_NOWARN)) { pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); @@ -513,6 +502,34 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, return page; } +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @no_warn: Avoid printing message about failed allocation + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) +{ + return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); +} + +struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = __cma_alloc(cma, 1 << order, order, gfp); + + return page ? page_folio(page) : NULL; +} + bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count) { @@ -564,6 +581,14 @@ bool cma_release(struct cma *cma, const struct page *pages, return true; } +bool cma_free_folio(struct cma *cma, const struct folio *folio) +{ + if (WARN_ON(!folio_test_large(folio))) + return false; + + return cma_release(cma, &folio->page, folio_nr_pages(folio)); +} + int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) { int i; diff --git a/mm/compaction.c b/mm/compaction.c index eb95e9b435d0..a2b16b08cbbf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -23,6 +23,7 @@ #include <linux/freezer.h> #include <linux/page_owner.h> #include <linux/psi.h> +#include <linux/cpuset.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -86,33 +87,6 @@ static struct page *mark_allocated_noprof(struct page *page, unsigned int order, } #define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) -static void split_map_pages(struct list_head *freepages) -{ - unsigned int i, order; - struct page *page, *next; - LIST_HEAD(tmp_list); - - for (order = 0; order < NR_PAGE_ORDERS; order++) { - list_for_each_entry_safe(page, next, &freepages[order], lru) { - unsigned int nr_pages; - - list_del(&page->lru); - - nr_pages = 1 << order; - - mark_allocated(page, order, __GFP_MOVABLE); - if (order) - split_page(page, order); - - for (i = 0; i < nr_pages; i++) { - list_add(&page->lru, &tmp_list); - page++; - } - } - list_splice_init(&tmp_list, &freepages[0]); - } -} - static unsigned long release_free_list(struct list_head *freepages) { int order; @@ -742,11 +716,11 @@ isolate_fail: * * Non-free pages, invalid PFNs, or zone boundaries within the * [start_pfn, end_pfn) range are considered errors, cause function to - * undo its actions and return zero. + * undo its actions and return zero. cc->freepages[] are empty. * * Otherwise, function returns one-past-the-last PFN of isolated page * (which may be greater then end_pfn if end fell in a middle of - * a free page). + * a free page). cc->freepages[] contain free pages isolated. */ unsigned long isolate_freepages_range(struct compact_control *cc, @@ -754,10 +728,9 @@ isolate_freepages_range(struct compact_control *cc, { unsigned long isolated, pfn, block_start_pfn, block_end_pfn; int order; - struct list_head tmp_freepages[NR_PAGE_ORDERS]; for (order = 0; order < NR_PAGE_ORDERS; order++) - INIT_LIST_HEAD(&tmp_freepages[order]); + INIT_LIST_HEAD(&cc->freepages[order]); pfn = start_pfn; block_start_pfn = pageblock_start_pfn(pfn); @@ -788,7 +761,7 @@ isolate_freepages_range(struct compact_control *cc, break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, tmp_freepages, 0, true); + block_end_pfn, cc->freepages, 0, true); /* * In strict mode, isolate_freepages_block() returns 0 if @@ -807,13 +780,10 @@ isolate_freepages_range(struct compact_control *cc, if (pfn < end_pfn) { /* Loop terminated early, cleanup. */ - release_free_list(tmp_freepages); + release_free_list(cc->freepages); return 0; } - /* __isolate_free_page() does not map the pages */ - split_map_pages(tmp_freepages); - /* We don't use freelists for anything. */ return pfn; } @@ -2853,6 +2823,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->highest_zoneidx, ac->nodemask) { enum compact_result status; + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + if (prio > MIN_COMPACT_PRIORITY && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); diff --git a/mm/damon/core.c b/mm/damon/core.c index 7a87628b76ab..a83f3b736d51 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -552,7 +552,13 @@ static unsigned int damon_accesses_bp_to_nr_accesses( return accesses_bp * damon_max_nr_accesses(attrs) / 10000; } -/* convert nr_accesses to access ratio in bp (per 10,000) */ +/* + * Convert nr_accesses to access ratio in bp (per 10,000). + * + * Callers should ensure attrs.aggr_interval is not zero, like + * damon_update_monitoring_results() does . Otherwise, divide-by-zero would + * happen. + */ static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { @@ -1582,13 +1588,16 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) return; /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); + memset(c->regions_score_histogram, 0, + sizeof(*c->regions_score_histogram) * + (DAMOS_MAX_SCORE + 1)); damon_for_each_target(t, c) { damon_for_each_region(r, t) { if (!__damos_valid_target(r, s)) continue; score = c->ops.get_scheme_score(c, t, r, s); - quota->histogram[score] += damon_sz_region(r); + c->regions_score_histogram[score] += + damon_sz_region(r); if (score > max_score) max_score = score; } @@ -1596,7 +1605,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) /* Set the min score limit */ for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; + cumulated_sz += c->regions_score_histogram[score]; if (cumulated_sz >= quota->esz || !score) break; } @@ -1957,6 +1966,10 @@ static int kdamond_fn(void *data) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) goto done; + ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1, + sizeof(*ctx->regions_score_histogram), GFP_KERNEL); + if (!ctx->regions_score_histogram) + goto done; sz_limit = damon_region_sz_limit(ctx); @@ -2034,6 +2047,7 @@ done: ctx->callback.before_terminate(ctx); if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); + kfree(ctx->regions_score_histogram); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); @@ -2205,4 +2219,4 @@ static int __init damon_init(void) subsys_initcall(damon_init); -#include "core-test.h" +#include "tests/core-kunit.h" diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 51a6f1cac385..b4213bc47e44 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1145,4 +1145,4 @@ out: module_init(damon_dbgfs_init); -#include "dbgfs-test.h" +#include "tests/dbgfs-kunit.h" diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cffc755e7775..58145d59881d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1882,4 +1882,4 @@ out: } subsys_initcall(damon_sysfs_init); -#include "sysfs-test.h" +#include "tests/sysfs-kunit.h" diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig new file mode 100644 index 000000000000..a73be044fc9b --- /dev/null +++ b/mm/damon/tests/.kunitconfig @@ -0,0 +1,22 @@ +# for DAMON core +CONFIG_KUNIT=y +CONFIG_DAMON=y +CONFIG_DAMON_KUNIT_TEST=y + +# for DAMON vaddr ops +CONFIG_MMU=y +CONFIG_PAGE_IDLE_FLAG=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_VADDR_KUNIT_TEST=y + +# for DAMON sysfs interface +CONFIG_SYSFS=y +CONFIG_DAMON_SYSFS=y +CONFIG_DAMON_SYSFS_KUNIT_TEST=y + +# for DAMON debugfs interface +CONFIG_DEBUG_FS=y +CONFIG_DAMON_PADDR=y +CONFIG_DAMON_DBGFS_DEPRECATED=y +CONFIG_DAMON_DBGFS=y +CONFIG_DAMON_DBGFS_KUNIT_TEST=y diff --git a/mm/damon/core-test.h b/mm/damon/tests/core-kunit.h index 0cee634f3544..cf22e09a3507 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/tests/core-kunit.h @@ -246,16 +246,20 @@ static void damon_test_split_regions_of(struct kunit *test) static void damon_test_ops_registration(struct kunit *test) { struct damon_ctx *c = damon_new_ctx(); - struct damon_operations ops, bak; + struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak; + bool need_cleanup = false; + + /* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */ + if (!damon_is_registered_ops(DAMON_OPS_VADDR)) { + bak.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&bak), 0); + need_cleanup = true; + } - /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + /* DAMON_OPS_VADDR is ensured to be registered */ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); - KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); /* Double-registration is prohibited */ - ops.id = DAMON_OPS_VADDR; - KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); - ops.id = DAMON_OPS_PADDR; KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); /* Unknown ops id cannot be registered */ @@ -278,6 +282,13 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); damon_destroy_ctx(c); + + if (need_cleanup) { + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = + (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + } } static void damon_test_set_regions(struct kunit *test) @@ -309,6 +320,18 @@ static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test) .aggr_interval = ((unsigned long)UINT_MAX + 1) * 10 }; + /* + * In some cases such as 32bit architectures where UINT_MAX is + * ULONG_MAX, attrs.aggr_interval becomes zero. Calling + * damon_nr_accesses_to_accesses_bp() in the case will cause + * divide-by-zero. Such case is prohibited in normal execution since + * the caution is documented on the comment for the function, and + * damon_update_monitoring_results() does the check. Skip the test in + * the case. + */ + if (!attrs.aggr_interval) + kunit_skip(test, "aggr_interval is zero."); + KUNIT_EXPECT_EQ(test, damon_nr_accesses_to_accesses_bp(123, &attrs), 0); } diff --git a/mm/damon/dbgfs-test.h b/mm/damon/tests/dbgfs-kunit.h index 2d85217f5ba4..d2ecfcc8db86 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/tests/dbgfs-kunit.h @@ -73,6 +73,11 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) struct damon_ctx *ctx = dbgfs_new_ctx(); char buf[64]; + if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { + dbgfs_destroy_ctx(ctx); + kunit_skip(test, "PADDR not registered"); + } + /* Make DAMON consider target has no pid */ damon_select_ops(ctx, DAMON_OPS_PADDR); @@ -111,6 +116,11 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) int i, rc; char buf[256]; + if (!damon_is_registered_ops(DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + kunit_skip(test, "PADDR not registered"); + } + damon_select_ops(ctx, DAMON_OPS_PADDR); dbgfs_set_targets(ctx, 3, NULL); diff --git a/mm/damon/sysfs-test.h b/mm/damon/tests/sysfs-kunit.h index 1c9b596057a7..1c9b596057a7 100644 --- a/mm/damon/sysfs-test.h +++ b/mm/damon/tests/sysfs-kunit.h diff --git a/mm/damon/vaddr-test.h b/mm/damon/tests/vaddr-kunit.h index 83626483f82b..a339d117150f 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + mt_init_flags(&mm.mm_mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) kunit_skip(test, "Failed to create VMA tree"); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a0036dc78a3b..08cfd22b5249 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -732,4 +732,4 @@ static int __init damon_va_initcall(void) subsys_initcall(damon_va_initcall); -#include "vaddr-test.h" +#include "tests/vaddr-kunit.h" diff --git a/mm/debug.c b/mm/debug.c index 69e524c3e601..aa57d3ffd4ed 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -36,11 +36,6 @@ const struct trace_print_flags pageflag_names[] = { {0, NULL} }; -const struct trace_print_flags pagetype_names[] = { - __def_pagetype_names, - {0, NULL} -}; - const struct trace_print_flags gfpflag_names[] = { __def_gfpflag_names, {0, NULL} @@ -51,6 +46,27 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; +#define DEF_PAGETYPE_NAME(_name) [PGTY_##_name - 0xf0] = __stringify(_name) + +static const char *page_type_names[] = { + DEF_PAGETYPE_NAME(slab), + DEF_PAGETYPE_NAME(hugetlb), + DEF_PAGETYPE_NAME(offline), + DEF_PAGETYPE_NAME(guard), + DEF_PAGETYPE_NAME(table), + DEF_PAGETYPE_NAME(buddy), + DEF_PAGETYPE_NAME(unaccepted), +}; + +static const char *page_type_name(unsigned int page_type) +{ + unsigned i = (page_type >> 24) - 0xf0; + + if (i >= ARRAY_SIZE(page_type_names)) + return "unknown"; + return page_type_names[i]; +} + static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { @@ -58,7 +74,7 @@ static void __dump_folio(struct folio *folio, struct page *page, int mapcount = atomic_read(&page->_mapcount); char *type = ""; - mapcount = page_type_has_type(mapcount) ? 0 : mapcount + 1; + mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1; pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); @@ -92,7 +108,8 @@ static void __dump_folio(struct folio *folio, struct page *page, pr_warn("%sflags: %pGp%s\n", type, &folio->flags, is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); if (page_has_type(&folio->page)) - pr_warn("page_type: %pGt\n", &folio->page.page_type); + pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24, + page_type_name(folio->page.page_type)); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e4969fb54da3..bc748f700a9e 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -231,10 +231,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) set_pmd_at(args->mm, vaddr, args->pmdp, pmd); flush_dcache_page(page); pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_write(pmd)); pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); pmd = pfn_pmd(args->pmd_pfn, args->page_prot); @@ -245,10 +245,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_mkwrite(pmd, args->vma); pmd = pmd_mkdirty(pmd); pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); @@ -256,7 +256,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) set_pmd_at(args->mm, vaddr, args->pmdp, pmd); flush_dcache_page(page); pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_young(pmd)); /* Clear the pte entries */ @@ -357,12 +357,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ pud = pfn_pud(args->pud_pfn, args->page_prot); @@ -374,12 +374,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ @@ -389,7 +389,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) set_pud_at(args->mm, vaddr, args->pudp, pud); flush_dcache_page(page); pudp_test_and_clear_young(args->vma, vaddr, args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_young(pud)); pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); @@ -441,7 +441,7 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) WRITE_ONCE(*args->pmdp, __pmd(0)); WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); WARN_ON(!pmd_clear_huge(args->pmdp)); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -461,7 +461,7 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args) WRITE_ONCE(*args->pudp, __pud(0)); WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); WARN_ON(!pud_clear_huge(args->pudp)); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -490,7 +490,7 @@ static void __init pgd_basic_tests(struct pgtable_debug_args *args) #ifndef __PAGETABLE_PUD_FOLDED static void __init pud_clear_tests(struct pgtable_debug_args *args) { - pud_t pud = READ_ONCE(*args->pudp); + pud_t pud = pudp_get(args->pudp); if (mm_pmd_folded(args->mm)) return; @@ -498,7 +498,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating PUD clear\n"); WARN_ON(pud_none(pud)); pud_clear(args->pudp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(!pud_none(pud)); } @@ -515,7 +515,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) * Hence this must not qualify as pud_bad(). */ pud_populate(args->mm, args->pudp, args->start_pmdp); - pud = READ_ONCE(*args->pudp); + pud = pudp_get(args->pudp); WARN_ON(pud_bad(pud)); } #else /* !__PAGETABLE_PUD_FOLDED */ @@ -526,7 +526,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #ifndef __PAGETABLE_P4D_FOLDED static void __init p4d_clear_tests(struct pgtable_debug_args *args) { - p4d_t p4d = READ_ONCE(*args->p4dp); + p4d_t p4d = p4dp_get(args->p4dp); if (mm_pud_folded(args->mm)) return; @@ -534,7 +534,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating P4D clear\n"); WARN_ON(p4d_none(p4d)); p4d_clear(args->p4dp); - p4d = READ_ONCE(*args->p4dp); + p4d = p4dp_get(args->p4dp); WARN_ON(!p4d_none(p4d)); } @@ -553,13 +553,13 @@ static void __init p4d_populate_tests(struct pgtable_debug_args *args) pud_clear(args->pudp); p4d_clear(args->p4dp); p4d_populate(args->mm, args->p4dp, args->start_pudp); - p4d = READ_ONCE(*args->p4dp); + p4d = p4dp_get(args->p4dp); WARN_ON(p4d_bad(p4d)); } static void __init pgd_clear_tests(struct pgtable_debug_args *args) { - pgd_t pgd = READ_ONCE(*(args->pgdp)); + pgd_t pgd = pgdp_get(args->pgdp); if (mm_p4d_folded(args->mm)) return; @@ -567,7 +567,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args) pr_debug("Validating PGD clear\n"); WARN_ON(pgd_none(pgd)); pgd_clear(args->pgdp); - pgd = READ_ONCE(*args->pgdp); + pgd = pgdp_get(args->pgdp); WARN_ON(!pgd_none(pgd)); } @@ -586,7 +586,7 @@ static void __init pgd_populate_tests(struct pgtable_debug_args *args) p4d_clear(args->p4dp); pgd_clear(args->pgdp); pgd_populate(args->mm, args->pgdp, args->start_p4dp); - pgd = READ_ONCE(*args->pgdp); + pgd = pgdp_get(args->pgdp); WARN_ON(pgd_bad(pgd)); } #else /* !__PAGETABLE_P4D_FOLDED */ @@ -627,12 +627,12 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) static void __init pmd_clear_tests(struct pgtable_debug_args *args) { - pmd_t pmd = READ_ONCE(*args->pmdp); + pmd_t pmd = pmdp_get(args->pmdp); pr_debug("Validating PMD clear\n"); WARN_ON(pmd_none(pmd)); pmd_clear(args->pmdp); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -646,7 +646,7 @@ static void __init pmd_populate_tests(struct pgtable_debug_args *args) * Hence this must not qualify as pmd_bad(). */ pmd_populate(args->mm, args->pmdp, args->start_ptep); - pmd = READ_ONCE(*args->pmdp); + pmd = pmdp_get(args->pmdp); WARN_ON(pmd_bad(pmd)); } @@ -1251,7 +1251,7 @@ static int __init init_args(struct pgtable_debug_args *args) ret = -ENOMEM; goto error; } - args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + args->start_ptep = pmd_pgtable(pmdp_get(args->pmdp)); WARN_ON(!args->start_ptep); init_fixed_pfns(args); diff --git a/mm/filemap.c b/mm/filemap.c index 65c515e7bbf0..4f3753f0a158 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -46,6 +46,7 @@ #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> +#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -112,8 +113,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->lruvec->lru_lock (follow_page->mark_page_accessed) - * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) + * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) @@ -530,7 +531,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); - folio_clear_error(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -2049,17 +2049,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2091,10 +2094,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2103,7 +2113,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xas_get_order(&xas); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2115,15 +2137,6 @@ put: } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } @@ -2344,13 +2357,6 @@ static int filemap_read_folio(struct file *file, filler_t filler, unsigned long pflags; int error; - /* - * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if read_folio - * fails. - */ - folio_clear_error(folio); - /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); @@ -2519,6 +2525,7 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; + unsigned int flags; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ @@ -2531,8 +2538,12 @@ retry: if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_NOWAIT) + flags = memalloc_noio_save(); page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { @@ -2560,6 +2571,7 @@ retry: goto err; } + trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) @@ -3000,7 +3012,7 @@ unlock: static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) - return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } @@ -3298,6 +3310,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; + trace_mm_filemap_fault(mapping, index); + /* * Do we have something in the page cache already? */ @@ -3668,6 +3682,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); + trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); @@ -4297,7 +4312,7 @@ static void filemap_cachestat(struct address_space *mapping, if (xas_retry(&xas, folio)) continue; - order = xa_get_order(xas.xa, xas.xa_index); + order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index f05906006b3c..80746182e9e8 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -92,15 +92,3 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); - -bool isolate_lru_page(struct page *page) -{ - if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) - return false; - return folio_isolate_lru((struct folio *)page); -} - -void putback_lru_page(struct page *page) -{ - folio_putback_lru(page_folio(page)); -} @@ -832,6 +832,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -889,6 +890,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } } + folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); @@ -899,7 +901,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_folio(page_folio(page), 1, flags); + ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -911,7 +913,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); + ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); @@ -1083,28 +1085,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, return page; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - struct follow_page_context ctx = { NULL }; - struct page *page; - - if (vma_is_secretmem(vma)) - return NULL; - - if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) - return NULL; - - /* - * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect - * to fail on PROT_NONE-mapped pages. - */ - page = follow_page_mask(vma, address, foll_flags, &ctx); - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); - return page; -} - static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -1166,19 +1146,19 @@ unmap: * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, bool unshare, + unsigned long address, unsigned int flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; - if (*flags & FOLL_NOFAULT) + if (flags & FOLL_NOFAULT) return -EFAULT; - if (*flags & FOLL_WRITE) + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; - if (*flags & FOLL_REMOTE) + if (flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (*flags & FOLL_UNLOCKABLE) { + if (flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set @@ -1186,12 +1166,12 @@ static int faultin_page(struct vm_area_struct *vma, * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ - if (*flags & FOLL_INTERRUPTIBLE) + if (flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } - if (*flags & FOLL_NOWAIT) + if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; - if (*flags & FOLL_TRIED) { + if (flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist @@ -1225,7 +1205,7 @@ static int faultin_page(struct vm_area_struct *vma, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, *flags); + int err = vm_fault_to_errno(ret, flags); if (err) return err; @@ -1450,7 +1430,6 @@ static long __get_user_pages(struct mm_struct *mm, do { struct page *page; - unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ @@ -1501,9 +1480,9 @@ retry: } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &ctx); + page = follow_page_mask(vma, start, gup_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { - ret = faultin_page(vma, start, &foll_flags, + ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: @@ -1560,13 +1539,12 @@ next_page: * large folio, this should never fail. */ if (try_grab_folio(folio, page_increm - 1, - foll_flags)) { + gup_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ - gup_put_folio(folio, 1, - foll_flags); + gup_put_folio(folio, 1, gup_flags); ret = -EFAULT; goto out; } @@ -2370,7 +2348,7 @@ static int migrate_longterm_unpinnable_folios( folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); - if (migrate_device_coherent_page(&folio->page)) { + if (migrate_device_coherent_folio(folio)) { ret = -EBUSY; goto err; } @@ -2532,7 +2510,7 @@ static bool is_valid_gup_args(struct page **pages, int *locked, * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only - * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) @@ -2934,7 +2912,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, * details. */ if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); + ret = arch_make_folio_accessible(folio); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; @@ -3073,6 +3051,9 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pmd_special(orig)) + return 0; + if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; @@ -3117,6 +3098,9 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; + if (pud_special(orig)) + return 0; + if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b71f744d360c..0580ac9e47b9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -40,6 +40,7 @@ #include <linux/memory-tiers.h> #include <linux/compat.h> #include <linux/pgalloc_tag.h> +#include <linux/pagewalk.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -73,6 +74,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); +static bool split_underused_thp = true; static atomic_t huge_zero_refcount; struct folio *huge_zero_folio __read_mostly; @@ -80,6 +82,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; +static bool anon_orders_configured __initdata; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, @@ -94,8 +97,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; - else if (vma_is_dax(vma)) - supported_orders = THP_ORDERS_ALL_FILE_DAX; + else if (vma_is_special_huge(vma)) + supported_orders = THP_ORDERS_ALL_SPECIAL; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; @@ -159,15 +162,10 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, * Must be done before hugepage flags check since shmem has its * own flags. */ - if (!in_pf && shmem_file(vma->vm_file)) { - bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, - !enforce_sysfs, vma->vm_mm, vm_flags); - - if (!vma_is_anon_shmem(vma)) - return global_huge ? orders : 0; + if (!in_pf && shmem_file(vma->vm_file)) return shmem_allowable_huge_orders(file_inode(vma->vm_file), - vma, vma->vm_pgoff, global_huge); - } + vma, vma->vm_pgoff, 0, + !enforce_sysfs); if (!vma_is_anonymous(vma)) { /* @@ -445,6 +443,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj, static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); +static ssize_t split_underused_thp_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", split_underused_thp); +} + +static ssize_t split_underused_thp_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err = kstrtobool(buf, &split_underused_thp); + + if (err < 0) + return err; + + return count; +} + +static struct kobj_attribute split_underused_thp_attr = __ATTR( + shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store); + static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, @@ -453,6 +472,7 @@ static struct attribute *hugepage_attr[] = { #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif + &split_underused_thp_attr.attr, NULL, }; @@ -465,8 +485,8 @@ static void thpsize_release(struct kobject *kobj); static DEFINE_SPINLOCK(huge_anon_orders_lock); static LIST_HEAD(thpsize_list); -static ssize_t thpsize_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t anon_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; @@ -483,9 +503,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj, return sysfs_emit(buf, "%s\n", output); } -static ssize_t thpsize_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t anon_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; @@ -527,19 +547,35 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj, return ret; } -static struct kobj_attribute thpsize_enabled_attr = - __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store); +static struct kobj_attribute anon_enabled_attr = + __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store); + +static struct attribute *anon_ctrl_attrs[] = { + &anon_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group anon_ctrl_attr_grp = { + .attrs = anon_ctrl_attrs, +}; -static struct attribute *thpsize_attrs[] = { - &thpsize_enabled_attr.attr, +static struct attribute *file_ctrl_attrs[] = { #ifdef CONFIG_SHMEM &thpsize_shmem_enabled_attr.attr, #endif NULL, }; -static const struct attribute_group thpsize_attr_group = { - .attrs = thpsize_attrs, +static const struct attribute_group file_ctrl_attr_grp = { + .attrs = file_ctrl_attrs, +}; + +static struct attribute *any_ctrl_attrs[] = { + NULL, +}; + +static const struct attribute_group any_ctrl_attr_grp = { + .attrs = any_ctrl_attrs, }; static const struct kobj_type thpsize_ktype = { @@ -578,64 +614,136 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); +#ifdef CONFIG_SHMEM DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC); DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK); DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE); +#endif DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT); DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); +DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); +DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); -static struct attribute *stats_attrs[] = { +static struct attribute *anon_stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, +#ifndef CONFIG_SHMEM &swpout_attr.attr, &swpout_fallback_attr.attr, +#endif + &split_deferred_attr.attr, + &nr_anon_attr.attr, + &nr_anon_partially_mapped_attr.attr, + NULL, +}; + +static struct attribute_group anon_stats_attr_grp = { + .name = "stats", + .attrs = anon_stats_attrs, +}; + +static struct attribute *file_stats_attrs[] = { +#ifdef CONFIG_SHMEM &shmem_alloc_attr.attr, &shmem_fallback_attr.attr, &shmem_fallback_charge_attr.attr, +#endif + NULL, +}; + +static struct attribute_group file_stats_attr_grp = { + .name = "stats", + .attrs = file_stats_attrs, +}; + +static struct attribute *any_stats_attrs[] = { +#ifdef CONFIG_SHMEM + &swpout_attr.attr, + &swpout_fallback_attr.attr, +#endif &split_attr.attr, &split_failed_attr.attr, - &split_deferred_attr.attr, NULL, }; -static struct attribute_group stats_attr_group = { +static struct attribute_group any_stats_attr_grp = { .name = "stats", - .attrs = stats_attrs, + .attrs = any_stats_attrs, }; +static int sysfs_add_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + int ret = -ENOENT; + + /* + * If the group is named, try to merge first, assuming the subdirectory + * was already created. This avoids the warning emitted by + * sysfs_create_group() if the directory already exists. + */ + if (grp->name) + ret = sysfs_merge_group(kobj, grp); + if (ret) + ret = sysfs_create_group(kobj, grp); + + return ret; +} + static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; struct thpsize *thpsize; - int ret; + int ret = -ENOMEM; thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); if (!thpsize) - return ERR_PTR(-ENOMEM); + goto err; + + thpsize->order = order; ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, "hugepages-%lukB", size); if (ret) { kfree(thpsize); - return ERR_PTR(ret); + goto err; } - ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + + ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); + if (ret) + goto err_put; + + if (BIT(order) & THP_ORDERS_ALL_ANON) { + ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); + if (ret) + goto err_put; } - ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group); - if (ret) { - kobject_put(&thpsize->kobj); - return ERR_PTR(ret); + if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { + ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); + if (ret) + goto err_put; + + ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); + if (ret) + goto err_put; } - thpsize->order = order; return thpsize; +err_put: + kobject_put(&thpsize->kobj); +err: + return ERR_PTR(ret); } static void thpsize_release(struct kobject *kobj) @@ -655,7 +763,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here. */ - huge_anon_orders_inherit = BIT(PMD_ORDER); + if (!anon_orders_configured) + huge_anon_orders_inherit = BIT(PMD_ORDER); *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { @@ -675,7 +784,7 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) goto remove_hp_group; } - orders = THP_ORDERS_ALL_ANON; + orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT; order = highest_order(orders); while (orders) { thpsize = thpsize_create(order, *hugepage_kobj); @@ -840,6 +949,100 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); +static inline int get_order_from_str(const char *size_str) +{ + unsigned long size; + char *endptr; + int order; + + size = memparse(size_str, &endptr); + + if (!is_power_of_2(size)) + goto err; + order = get_order(size); + if (BIT(order) & ~THP_ORDERS_ALL_ANON) + goto err; + + return order; +err: + pr_err("invalid size %s in thp_anon boot parameter\n", size_str); + return -EINVAL; +} + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_anon(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strcpy(str_dup, str); + + always = huge_anon_orders_always; + madvise = huge_anon_orders_madvise; + inherit = huge_anon_orders_inherit; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size); + end = get_order_from_str(end_size); + } else { + start = end = get_order_from_str(subtoken); + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + } else if (!strcmp(policy, "madvise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else { + pr_err("invalid policy %s in thp_anon boot parameter\n", policy); + goto err; + } + } + } + + huge_anon_orders_always = always; + huge_anon_orders_madvise = madvise; + huge_anon_orders_inherit = inherit; + anon_orders_configured = true; + return 1; + +err: + pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_anon=", setup_thp_anon); + pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) @@ -1009,6 +1212,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); + deferred_split_folio(folio, false); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); @@ -1168,6 +1372,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); + else + entry = pmd_mkspecial(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -1251,10 +1457,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { - if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { - WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn))) goto out_unlock; - } entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) @@ -1266,6 +1470,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); + else + entry = pud_mkspecial(entry); if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); @@ -1379,6 +1585,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgtable_t pgtable = NULL; int ret = -ENOMEM; + pmd = pmdp_get_lockless(src_pmd); + if (unlikely(pmd_special(pmd))) { + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + /* + * No need to recheck the pmd, it can't change with write + * mmap lock held here. + * + * Meanwhile, making sure it's not a CoW VMA with writable + * mapping, otherwise it means either the anon page wrongly + * applied special bit, or we made the PRIVATE mapping be + * able to wrongly write to the backend MMIO. + */ + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd)); + goto set_pmd; + } + /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; @@ -1460,7 +1684,9 @@ out_zero_page: pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); - pmd = pmd_mkold(pmd_wrprotect(pmd)); + pmd = pmd_wrprotect(pmd); +set_pmd: + pmd = pmd_mkold(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; @@ -1503,20 +1729,14 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; /* - * When page table lock is held, the huge zero pud should not be - * under splitting since we don't split the page itself, only pud to - * a page table. - */ - if (is_huge_zero_pud(pud)) { - /* No huge zero pud yet */ - } - - /* * TODO: once we support anonymous pages, use * folio_try_dup_anon_rmap_*() and split if duplicating fails. */ - pudp_set_wrprotect(src_mm, addr, src_pud); - pud = pud_mkold(pud_wrprotect(pud)); + if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) { + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_wrprotect(pud); + } + pud = pud_mkold(pud); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; @@ -1675,22 +1895,23 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma, vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - pmd_t oldpmd = vmf->orig_pmd; - pmd_t pmd; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int nid = NUMA_NO_NODE; - int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); + int target_nid, last_cpupid; + pmd_t pmd, old_pmd; bool writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + old_pmd = pmdp_get(vmf->pmd); + + if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } - pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_modify(old_pmd, vma->vm_page_prot); /* * Detect now whether the PMD could be writable; this information @@ -1705,18 +1926,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) if (!folio) goto out_map; - /* See similar comment in do_numa_page for explanation */ - if (!writable) - flags |= TNF_NO_GROUP; - nid = folio_nid(folio); - /* - * For memory tiering mode, cpupid of slow memory page is used - * to record page access time. So use default value. - */ - if (node_is_toptier(nid)) - last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags); + + target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable, + &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { @@ -1736,13 +1949,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) { spin_unlock(vmf->ptl); return 0; } out_map: /* Restore the PMD */ - pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) pmd = pmd_mkwrite(pmd, vma); @@ -2064,8 +2277,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, toptier) goto unlock; - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !toptier) + if (folio_use_access_time(folio)) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } @@ -2118,6 +2330,53 @@ unlock: return ret; } +/* + * Returns: + * + * - 0: if pud leaf changed from under us + * - 1: if pud can be skipped + * - HPAGE_PUD_NR: if pud was successfully processed + */ +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t oldpud, entry; + spinlock_t *ptl; + + tlb_change_page_size(tlb, HPAGE_PUD_SIZE); + + /* NUMA balancing doesn't apply to dax */ + if (cp_flags & MM_CP_PROT_NUMA) + return 1; + + /* + * Huge entries on userfault-wp only works with anonymous, while we + * don't have anonymous PUDs yet. + */ + if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL)) + return 1; + + ptl = __pud_trans_huge_lock(pudp, vma); + if (!ptl) + return 0; + + /* + * Can't clear PUD or it can race with concurrent zapping. See + * change_huge_pmd(). + */ + oldpud = pudp_invalidate(vma, addr, pudp); + entry = pud_modify(oldpud, newprot); + set_pud_at(mm, addr, pudp, entry); + tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE); + + spin_unlock(ptl); + return HPAGE_PUD_NR; +} +#endif + #ifdef CONFIG_USERFAULTFD /* * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by @@ -2297,12 +2556,14 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { spinlock_t *ptl; + pud_t orig_pud; ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; - pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); + orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); + arch_check_zapped_pud(vma, orig_pud); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); @@ -2346,6 +2607,11 @@ out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } +#else +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, @@ -2780,7 +3046,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, return false; } -static void remap_page(struct folio *folio, unsigned long nr) +static void remap_page(struct folio *folio, unsigned long nr, int flags) { int i = 0; @@ -2788,7 +3054,7 @@ static void remap_page(struct folio *folio, unsigned long nr) if (!folio_test_anon(folio)) return; for (;;) { - remove_migration_ptes(folio, folio, true); + remove_migration_ptes(folio, folio, RMP_LOCKED | flags); i += folio_nr_pages(folio); if (i >= nr) break; @@ -2796,25 +3062,25 @@ static void remap_page(struct folio *folio, unsigned long nr) } } -static void lru_add_page_tail(struct page *head, struct page *tail, +static void lru_add_page_tail(struct folio *folio, struct page *tail, struct lruvec *lruvec, struct list_head *list) { - VM_BUG_ON_PAGE(!PageHead(head), head); - VM_BUG_ON_PAGE(PageLRU(tail), head); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + VM_BUG_ON_FOLIO(PageLRU(tail), folio); lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ - VM_WARN_ON(PageLRU(head)); + VM_WARN_ON(folio_test_lru(folio)); get_page(tail); list_add_tail(&tail->lru, list); } else { /* head is still on lru (and we have it frozen) */ - VM_WARN_ON(!PageLRU(head)); - if (PageUnevictable(tail)) + VM_WARN_ON(!folio_test_lru(folio)); + if (folio_test_unevictable(folio)) tail->mlock_count = 0; else - list_add_tail(&tail->lru, &head->lru); + list_add_tail(&tail->lru, &folio->lru); SetPageLRU(tail); } } @@ -2857,8 +3123,10 @@ static void __split_huge_page_tail(struct folio *folio, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | @@ -2913,7 +3181,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, * pages to show after the currently processed elements - e.g. * migrate_pages */ - lru_add_page_tail(head, page_tail, lruvec, list); + lru_add_page_tail(folio, page_tail, lruvec, list); } static void __split_huge_page(struct page *page, struct list_head *list, @@ -2976,7 +3244,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, order, new_order); - pgalloc_tag_split(head, 1 << order); + pgalloc_tag_split(folio, order, new_order); /* See comment in __split_huge_page_tail() */ if (folio_test_anon(folio)) { @@ -2996,7 +3264,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (nr_dropped) shmem_uncharge(folio->mapping->host, nr_dropped); - remap_page(folio, nr); + remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); /* * set page to its compound_head when split to non order-0 pages, so @@ -3025,7 +3293,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } /* Racy check whether the huge page can be split */ -bool can_split_folio(struct folio *folio, int *pextra_pins) +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { int extra_pins; @@ -3037,7 +3305,8 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) extra_pins = folio_nr_pages(folio); if (pextra_pins) *pextra_pins = extra_pins; - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - + caller_pins; } /* @@ -3094,8 +3363,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, struct deferred_split *ds_queue = get_deferred_split_queue(folio); /* reset xarray order to new order after split */ XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); - struct anon_vma *anon_vma = NULL; + bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; int order = folio_order(folio); int extra_pins, ret; pgoff_t end; @@ -3107,7 +3377,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - if (folio_test_anon(folio)) { + if (is_anon) { /* order-1 is not supported for anonymous THP. */ if (new_order == 1) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); @@ -3147,7 +3417,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_test_writeback(folio)) return -EBUSY; - if (folio_test_anon(folio)) { + if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -3217,7 +3487,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, * Racy check if we can split the page, before unmap_folio() will * split PMDs */ - if (!can_split_folio(folio, &extra_pins)) { + if (!can_split_folio(folio, 1, &extra_pins)) { ret = -EAGAIN; goto out_unlock; } @@ -3243,6 +3513,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } /* * Reinitialize page_deferred_list after removing the * page from the split_queue, otherwise a subsequent @@ -3269,6 +3544,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, } } + if (is_anon) { + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); + mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); + } __split_huge_page(page, list, end, new_order); ret = 0; } else { @@ -3277,7 +3556,7 @@ fail: if (mapping) xas_unlock(&xas); local_irq_enable(); - remap_page(folio, folio_nr_pages(folio)); + remap_page(folio, folio_nr_pages(folio), 0); ret = -EAGAIN; } @@ -3329,12 +3608,18 @@ void __folio_undo_large_rmappable(struct folio *folio) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } list_del_init(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } -void deferred_split_folio(struct folio *folio) +/* partially_mapped=false won't clear PG_partially_mapped folio flag */ +void deferred_split_folio(struct folio *folio, bool partially_mapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG @@ -3349,6 +3634,9 @@ void deferred_split_folio(struct folio *folio) if (folio_order(folio) <= 1) return; + if (!partially_mapped && !split_underused_thp) + return; + /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. @@ -3362,14 +3650,21 @@ void deferred_split_folio(struct folio *folio) if (folio_test_swapcache(folio)) return; - if (!list_empty(&folio->_deferred_list)) - return; - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (partially_mapped) { + if (!folio_test_partially_mapped(folio)) { + __folio_set_partially_mapped(folio); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_DEFERRED_SPLIT_PAGE); + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); + + } + } else { + /* partially mapped folios cannot become non-partially mapped */ + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); + } if (list_empty(&folio->_deferred_list)) { - if (folio_test_pmd_mappable(folio)) - count_vm_event(THP_DEFERRED_SPLIT_PAGE); - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG @@ -3394,6 +3689,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink, return READ_ONCE(ds_queue->split_queue_len); } +static bool thp_underused(struct folio *folio) +{ + int num_zero_pages = 0, num_filled_pages = 0; + void *kaddr; + int i; + + if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) + return false; + + for (i = 0; i < folio_nr_pages(folio); i++) { + kaddr = kmap_local_folio(folio, i * PAGE_SIZE); + if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { + num_zero_pages++; + if (num_zero_pages > khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return true; + } + } else { + /* + * Another path for early exit once the number + * of non-zero filled pages exceeds threshold. + */ + num_filled_pages++; + if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { + kunmap_local(kaddr); + return false; + } + } + kunmap_local(kaddr); + } + return false; +} + static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { @@ -3417,6 +3745,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ + if (folio_test_partially_mapped(folio)) { + __folio_clear_partially_mapped(folio); + mod_mthp_stat(folio_order(folio), + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); + } list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } @@ -3426,13 +3759,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { + bool did_split = false; + bool underused = false; + + if (!folio_test_partially_mapped(folio)) { + underused = thp_underused(folio); + if (!underused) + goto next; + } if (!folio_trylock(folio)) goto next; - /* split_huge_page() removes page from list on success */ - if (!split_folio(folio)) + if (!split_folio(folio)) { + did_split = true; + if (underused) + count_vm_event(THP_UNDERUSED_SPLIT_PAGE); split++; + } folio_unlock(folio); next: + /* + * split_folio() removes folio from list on success. + * Only add back to the queue if folio is partially mapped. + * If thp_underused returns false, or if split_folio fails + * in the case it was underused, then consider it used and + * don't add it back to split_queue. + */ + if (!did_split && !folio_test_partially_mapped(folio)) { + list_del_init(&folio->_deferred_list); + ds_queue->split_queue_len--; + } folio_put(folio); } @@ -3518,16 +3873,11 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, vaddr_start &= PAGE_MASK; vaddr_end &= PAGE_MASK; - /* Find the task_struct from pid */ - rcu_read_lock(); - task = find_task_by_vpid(pid); + task = find_get_task_by_vpid(pid); if (!task) { - rcu_read_unlock(); ret = -ESRCH; goto out; } - get_task_struct(task); - rcu_read_unlock(); /* Find the mm_struct */ mm = get_task_mm(task); @@ -3548,7 +3898,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); - struct page *page; + struct folio_walk fw; struct folio *folio; struct address_space *mapping; unsigned int target_order = new_order; @@ -3562,13 +3912,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, continue; } - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - if (IS_ERR_OR_NULL(page)) + folio = folio_walk_start(&fw, vma, addr, 0); + if (!folio) continue; - folio = page_folio(page); if (!is_transparent_hugepage(folio)) goto next; @@ -3588,11 +3935,13 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, * can be split or not. So skip the check here. */ if (!folio_test_private(folio) && - !can_split_folio(folio, NULL)) + !can_split_folio(folio, 0, NULL)) goto next; if (!folio_trylock(folio)) goto next; + folio_get(folio); + folio_walk_end(&fw, vma); if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock; @@ -3603,8 +3952,12 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, unlock: folio_unlock(folio); -next: folio_put(folio); + + cond_resched(); + continue; +next: + folio_walk_end(&fw, vma); cond_resched(); } mmap_read_unlock(mm); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a3a6e2dee97..def84d8bcf2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -56,16 +56,6 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, - 1 << order); -} -#else -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return false; -} #endif static unsigned long hugetlb_cma_size __initdata; @@ -82,14 +72,14 @@ static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ -DEFINE_SPINLOCK(hugetlb_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ -static int num_fault_mutexes; -struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static int num_fault_mutexes __ro_after_init; +struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -100,6 +90,17 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +static void hugetlb_free_folio(struct folio *folio) +{ +#ifdef CONFIG_CMA + int nid = folio_nid(folio); + + if (cma_free_folio(hugetlb_cma[nid], folio)) + return; +#endif + folio_put(folio); +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1512,95 +1513,54 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -/* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i; - int nr_pages = 1 << order; - struct page *p; - - atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_large_mapcount, 0); - atomic_set(&folio->_pincount, 0); - - for (i = 1; i < nr_pages; i++) { - p = folio_page(folio, i); - p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; - p->mapping = NULL; - clear_compound_head(p); - if (!demote) - set_page_refcounted(p); - } - - __folio_clear_head(folio); -} - -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, true); -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, false); -} - -static void free_gigantic_folio(struct folio *folio, unsigned int order) -{ - /* - * If the page isn't allocated using the cma allocator, - * cma_release() returns false. - */ -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) - return; -#endif - - free_contig_range(folio_pfn(folio), 1 << order); -} - #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - struct page *page; - unsigned long nr_pages = pages_per_huge_page(h); + struct folio *folio; + int order = huge_page_order(h); + bool retried = false; + if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - +retry: + folio = NULL; #ifdef CONFIG_CMA { int node; - if (hugetlb_cma[nid]) { - page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); - } + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - if (!(gfp_mask & __GFP_THISNODE)) { + if (!folio && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; } } } #endif + if (!folio) { + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); + if (!folio) + return NULL; + } - page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); - return page ? page_folio(page) : NULL; + if (folio_ref_freeze(folio, 1)) + return folio; + + pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); + hugetlb_free_folio(folio); + if (!retried) { + retried = true; + goto retry; + } + return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ @@ -1617,10 +1577,6 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, { return NULL; } -static inline void free_gigantic_folio(struct folio *folio, - unsigned int order) { } -static inline void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) { } #endif /* @@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, folio_ref_unfreeze(folio, 1); - /* - * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_folio. - */ - if (hstate_is_gigantic(h) || - hugetlb_cma_folio(folio, huge_page_order(h))) { - destroy_compound_gigantic_folio(folio, huge_page_order(h)); - free_gigantic_folio(folio, huge_page_order(h)); - } else { - INIT_LIST_HEAD(&folio->_deferred_list); - folio_put(folio); - } + INIT_LIST_HEAD(&folio->_deferred_list); + hugetlb_free_folio(folio); } /* @@ -2032,95 +1978,6 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i, j; - int nr_pages = 1 << order; - struct page *p; - - __folio_clear_reserved(folio); - for (i = 0; i < nr_pages; i++) { - p = folio_page(folio, i); - - /* - * For gigantic hugepages allocated through bootmem at - * boot, it's safer to be consistent with the not-gigantic - * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwise drivers using get_user_pages() to access tail - * pages may get the reference counting wrong if they see - * PG_reserved set on a tail page (despite the head page not - * having PG_reserved set). Enforcing this consistency between - * head and tail pages allows drivers to optimize away a check - * on the head page when they need know if put_page() is needed - * after get_user_pages(). - */ - if (i != 0) /* head page cleared above */ - __ClearPageReserved(p); - /* - * Subtle and very unlikely - * - * Gigantic 'page allocators' such as memblock or cma will - * return a set of pages with each page ref counted. We need - * to turn this set of pages into a compound page with tail - * page ref counts set to zero. Code such as speculative page - * cache adding could take a ref on a 'to be' tail page. - * We need to respect any increased ref count, and only set - * the ref count to zero if count is currently 1. If count - * is not 1, we return an error. An error return indicates - * the set of pages can not be converted to a gigantic page. - * The caller who allocated the pages should then discard the - * pages using the appropriate free interface. - * - * In the case of demote, the ref count will be zero. - */ - if (!demote) { - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; - } - } else { - VM_BUG_ON_PAGE(page_count(p), p); - } - if (i != 0) - set_compound_head(p, &folio->page); - } - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */ - folio_set_order(folio, order); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_large_mapcount, -1); - atomic_set(&folio->_pincount, 0); - return true; - -out_error: - /* undo page modifications made above */ - for (j = 0; j < i; j++) { - p = folio_page(folio, j); - if (j != 0) - clear_compound_head(p); - set_page_refcounted(p); - } - /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++) { - p = folio_page(folio, j); - __ClearPageReserved(p); - } - return false; -} - -static bool prep_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, false); -} - -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, true); -} - /* * Find and lock address space (mapping) in write mode. * @@ -2159,7 +2016,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; - gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) @@ -2206,48 +2062,16 @@ retry: return folio; } -static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) -{ - struct folio *folio; - bool retry = false; - -retry: - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, - nid, nmask, node_alloc_noretry); - if (!folio) - return NULL; - - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { - /* - * Rare failure to convert pages to compound page. - * Free pages and try again - ONCE! - */ - free_gigantic_folio(folio, huge_page_order(h)); - if (!retry) { - retry = true; - goto retry; - } - return NULL; - } - } - - return folio; -} - static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2265,7 +2089,10 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; @@ -2549,9 +2376,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ @@ -3333,6 +3159,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); + __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3921,101 +3748,120 @@ out: return 0; } -static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) +static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, + struct list_head *src_list) { - int i, nid = folio_nid(folio); - struct hstate *target_hstate; - struct page *subpage; - struct folio *inner_folio; - int rc = 0; - - target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - - remove_hugetlb_folio(h, folio, false); - spin_unlock_irq(&hugetlb_lock); + long rc; + struct folio *folio, *next; + LIST_HEAD(dst_list); + LIST_HEAD(ret_list); - /* - * If vmemmap already existed for folio, the remove routine above would - * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be - * passed hugetlb folios and will BUG otherwise. - */ - if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore_folio(h, folio); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - return rc; - } - } - - /* - * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count folios. - */ - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); + list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * - * Note that we already hold h->resize_lock. To prevent deadlock, + * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ - mutex_lock(&target_hstate->resize_lock); - for (i = 0; i < pages_per_huge_page(h); - i += pages_per_huge_page(target_hstate)) { - subpage = folio_page(folio, i); - inner_folio = page_folio(subpage); - if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(inner_folio, - target_hstate->order); - else - prep_compound_page(subpage, target_hstate->order); - folio_change_private(inner_folio, NULL); - prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_folio(inner_folio); + mutex_lock(&dst->resize_lock); + + list_for_each_entry_safe(folio, next, src_list, lru) { + int i; + + if (folio_test_hugetlb_vmemmap_optimized(folio)) + continue; + + list_del(&folio->lru); + + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); + pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); + + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { + struct page *page = folio_page(folio, i); + + page->mapping = NULL; + clear_compound_head(page); + prep_compound_page(page, dst->order); + + init_new_hugetlb_folio(dst, page_folio(page)); + list_add(&page->lru, &dst_list); + } } - mutex_unlock(&target_hstate->resize_lock); - spin_lock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(dst, &dst_list); - /* - * Not absolutely necessary, but for consistency update max_huge_pages - * based on pool changes for the demoted page. - */ - h->max_huge_pages--; - target_hstate->max_huge_pages += - pages_per_huge_page(h) / pages_per_huge_page(target_hstate); + mutex_unlock(&dst->resize_lock); return rc; } -static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct folio *folio; + struct hstate *dst; + long rc = 0; + long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ - if (!h->demote_order) { + if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } + dst = size_to_hstate(PAGE_SIZE << src->demote_order); - for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { + LIST_HEAD(list); + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; - return demote_free_hugetlb_folio(h, folio); + + remove_hugetlb_folio(src, folio, false); + list_add(&folio->lru, &list); + + if (++nr_demoted == nr_to_demote) + break; + } + + spin_unlock_irq(&hugetlb_lock); + + rc = demote_free_hugetlb_folios(src, dst, &list); + + spin_lock_irq(&hugetlb_lock); + + list_for_each_entry_safe(folio, next, &list, lru) { + list_del(&folio->lru); + add_hugetlb_folio(src, folio, false); + + nr_demoted--; } + + if (rc < 0 || nr_demoted == nr_to_demote) + break; } /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page. + */ + src->max_huge_pages -= nr_demoted; + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); + + if (rc < 0) + return rc; + + if (nr_demoted) + return nr_demoted; + /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ @@ -4249,6 +4095,8 @@ static ssize_t demote_store(struct kobject *kobj, spin_lock_irq(&hugetlb_lock); while (nr_demote) { + long rc; + /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. @@ -4261,11 +4109,13 @@ static ssize_t demote_store(struct kobject *kobj, if (!nr_available) break; - err = demote_pool_huge_page(h, n_mask); - if (err) + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; break; + } - nr_demote--; + nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); @@ -7227,7 +7077,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, return 0; } -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) @@ -7389,7 +7239,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } -#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -7412,7 +7262,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } -#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -7510,7 +7360,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif @@ -7519,10 +7369,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -/* - * These functions are overwritable if your architecture needs its own - * behavior. - */ bool isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 4ff238ba1250..e716c4671a15 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -114,10 +114,10 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), - fault_parent); + fault_parent, false); page_counter_init( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), - rsvd_parent); + rsvd_parent, false); limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 0c3f56b3578e..57b7f591eee8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -43,6 +43,8 @@ struct vmemmap_remap_walk { #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) +/* synchronize_rcu() to avoid writes from page_ref_add_unless() */ +#define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; @@ -457,6 +459,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; + if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; @@ -489,10 +494,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); - - return __hugetlb_vmemmap_restore_folio(h, folio, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); } /** @@ -515,14 +517,14 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct folio *folio, *t_folio; long restored = 0; long ret = 0; - - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore_folio(h, folio, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU; + if (ret) break; restored++; @@ -570,6 +572,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); + + if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -617,10 +622,7 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); } @@ -647,6 +649,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l { struct folio *folio; LIST_HEAD(vmemmap_pages); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry(folio, folio_list, lru) { int ret = hugetlb_vmemmap_split_folio(h, folio); @@ -663,14 +666,12 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); - /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - list_for_each_entry(folio, folio_list, lru) { int ret; - ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. If we @@ -684,8 +685,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } diff --git a/mm/internal.h b/mm/internal.h index a963f67d3452..93083bbeeefa 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -8,13 +8,19 @@ #define __MM_INTERNAL_H #include <linux/fs.h> +#include <linux/khugepaged.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> +/* Internal core VMA manipulation functions. */ +#include "vma.h" + struct folio_batch; /* @@ -270,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; + swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; + unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); - VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte))); + VM_WARN_ON(non_swap_entry(entry)); + cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; - + if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) + break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } @@ -415,9 +425,7 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ -bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); -void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); @@ -787,37 +795,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) return list_empty(&area->free_list[migratetype]); } -/* - * These three helpers classifies VMAs for virtual memory accounting. - */ - -/* - * Executable code area - executable, not writable, not stack - */ -static inline bool is_exec_mapping(vm_flags_t flags) -{ - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; -} - -/* - * Stack area (including shadow stacks) - * - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: - * do_mmap() forbids all other combinations. - */ -static inline bool is_stack_mapping(vm_flags_t flags) -{ - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); -} - -/* - * Data area - private, writable, not stack - */ -static inline bool is_data_mapping(vm_flags_t flags) -{ - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; -} - /* mm/util.c */ struct anon_vma *folio_anon_vma(struct folio *folio); @@ -1078,6 +1055,8 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +#ifdef CONFIG_MEMORY_FAILURE +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); @@ -1098,6 +1077,12 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p, unsigned long ksm_addr); unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +#else +static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ +} +#endif + extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -1174,7 +1159,6 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm) #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; -extern const struct trace_print_flags pagetype_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; @@ -1226,11 +1210,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, - unsigned long addr, int page_nid, int *flags); +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, + unsigned long addr, int *flags, bool writable, + int *last_cpupid); void free_zone_device_folio(struct folio *folio); -int migrate_device_coherent_page(struct page *page); +int migrate_device_coherent_folio(struct folio *folio); /* * mm/gup.c @@ -1246,13 +1231,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -/* - * mm/mmap.c - */ -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); - enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, @@ -1379,117 +1357,6 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } -static inline void vma_iter_config(struct vma_iterator *vmi, - unsigned long index, unsigned long last) -{ - __mas_set_range(&vmi->mas, index, last - 1); -} - -static inline void vma_iter_reset(struct vma_iterator *vmi) -{ - mas_reset(&vmi->mas); -} - -static inline -struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) -{ - return mas_prev_range(&vmi->mas, min); -} - -static inline -struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) -{ - return mas_next_range(&vmi->mas, max); -} - -static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, - unsigned long max, unsigned long size) -{ - return mas_empty_area(&vmi->mas, min, max - 1, size); -} - -static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, - unsigned long max, unsigned long size) -{ - return mas_empty_area_rev(&vmi->mas, min, max - 1, size); -} - -/* - * VMA Iterator functions shared between nommu and mmap - */ -static inline int vma_iter_prealloc(struct vma_iterator *vmi, - struct vm_area_struct *vma) -{ - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); -} - -static inline void vma_iter_clear(struct vma_iterator *vmi) -{ - mas_store_prealloc(&vmi->mas, NULL); -} - -static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) -{ - return mas_walk(&vmi->mas); -} - -/* Store a VMA with preallocated memory */ -static inline void vma_iter_store(struct vma_iterator *vmi, - struct vm_area_struct *vma) -{ - -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && - vmi->mas.index > vma->vm_start)) { - pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", - vmi->mas.index, vma->vm_start, vma->vm_start, - vma->vm_end, vmi->mas.index, vmi->mas.last); - } - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && - vmi->mas.last < vma->vm_start)) { - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, - vmi->mas.index, vmi->mas.last); - } -#endif - - if (vmi->mas.status != ma_start && - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) - vma_iter_invalidate(vmi); - - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(&vmi->mas, vma); -} - -static inline int vma_iter_store_gfp(struct vma_iterator *vmi, - struct vm_area_struct *vma, gfp_t gfp) -{ - if (vmi->mas.status != ma_start && - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) - vma_iter_invalidate(vmi); - - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); - mas_store_gfp(&vmi->mas, vma, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - -/* - * VMA lock generalization - */ -struct vma_prepare { - struct vm_area_struct *vma; - struct vm_area_struct *adj_next; - struct file *file; - struct address_space *mapping; - struct anon_vma *anon_vma; - struct vm_area_struct *insert; - struct vm_area_struct *remove; - struct vm_area_struct *remove2; -}; - void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); @@ -1506,27 +1373,11 @@ static inline int can_do_mseal(unsigned long flags) return 0; } -bool can_modify_mm(struct mm_struct *mm, unsigned long start, - unsigned long end); -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior); #else static inline int can_do_mseal(unsigned long flags) { return -EPERM; } - -static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - return true; -} - -static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior) -{ - return true; -} #endif #ifdef CONFIG_SHRINKER_DEBUG @@ -1578,13 +1429,18 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; -struct unlink_vma_file_batch { - int count; - struct vm_area_struct *vmas[8]; -}; +/* mremap.c */ +unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len, + bool need_rmap_locks, bool for_stack); -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *); -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *); -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *); +#ifdef CONFIG_UNACCEPTED_MEMORY +void accept_page(struct page *page); +#else /* CONFIG_UNACCEPTED_MEMORY */ +static inline void accept_page(struct page *page) +{ +} +#endif /* CONFIG_UNACCEPTED_MEMORY */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index c5cb54fc696d..67fc321db79b 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -99,6 +99,10 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte static unsigned long kfence_skip_covered_thresh __read_mostly = 75; module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); +/* Allocation burst count: number of excess KFENCE allocations per sample. */ +static unsigned int kfence_burst __read_mostly; +module_param_named(burst, kfence_burst, uint, 0644); + /* If true, use a deferrable timer. */ static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); module_param_named(deferrable, kfence_deferrable, bool, 0444); @@ -269,6 +273,13 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m return pageaddr; } +static inline bool kfence_obj_allocated(const struct kfence_metadata *meta) +{ + enum kfence_object_state state = READ_ONCE(meta->state); + + return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING; +} + /* * Update the object's metadata state, including updating the alloc/free stacks * depending on the state transition. @@ -278,10 +289,14 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex unsigned long *stack_entries, size_t num_stack_entries) { struct kfence_track *track = - next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track; lockdep_assert_held(&meta->lock); + /* Stack has been saved when calling rcu, skip. */ + if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING) + goto out; + if (stack_entries) { memcpy(track->stack_entries, stack_entries, num_stack_entries * sizeof(stack_entries[0])); @@ -297,6 +312,7 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex track->cpu = raw_smp_processor_id(); track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ +out: /* * Pairs with READ_ONCE() in * kfence_shutdown_cache(), @@ -502,7 +518,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z raw_spin_lock_irqsave(&meta->lock, flags); - if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) { /* Invalid or double-free, bail out. */ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); kfence_report_error((unsigned long)addr, false, NULL, meta, @@ -780,7 +796,7 @@ static void kfence_check_all_canary(void) for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { struct kfence_metadata *meta = &kfence_metadata[i]; - if (meta->state == KFENCE_OBJECT_ALLOCATED) + if (kfence_obj_allocated(meta)) check_canary(meta); } } @@ -827,12 +843,12 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - atomic_set(&kfence_allocation_gate, 0); + atomic_set(&kfence_allocation_gate, -kfence_burst); #ifdef CONFIG_KFENCE_STATIC_KEYS /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); @@ -1006,12 +1022,11 @@ void kfence_shutdown_cache(struct kmem_cache *s) * the lock will not help, as different critical section * serialization will have the same outcome. */ - if (READ_ONCE(meta->cache) != s || - READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta)) continue; raw_spin_lock_irqsave(&meta->lock, flags); - in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + in_use = meta->cache == s && kfence_obj_allocated(meta); raw_spin_unlock_irqrestore(&meta->lock, flags); if (in_use) { @@ -1052,6 +1067,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) unsigned long stack_entries[KFENCE_STACK_DEPTH]; size_t num_stack_entries; u32 alloc_stack_hash; + int allocation_gate; /* * Perform size check before switching kfence_allocation_gate, so that @@ -1080,14 +1096,15 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) if (s->flags & SLAB_SKIP_KFENCE) return NULL; - if (atomic_inc_return(&kfence_allocation_gate) > 1) + allocation_gate = atomic_inc_return(&kfence_allocation_gate); + if (allocation_gate > 1) return NULL; #ifdef CONFIG_KFENCE_STATIC_KEYS /* * waitqueue_active() is fully ordered after the update of * kfence_allocation_gate per atomic_inc_return(). */ - if (waitqueue_active(&allocation_wait)) { + if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) { /* * Calling wake_up() here may deadlock when allocations happen * from within timer code. Use an irq_work to defer it. @@ -1154,11 +1171,19 @@ void __kfence_free(void *addr) * the object, as the object page may be recycled for other-typed * objects once it has been freed. meta->cache may be NULL if the cache * was destroyed. + * Save the stack trace here so that reports show where the user freed + * the object. */ - if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) { + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0); + raw_spin_unlock_irqrestore(&meta->lock, flags); call_rcu(&meta->rcu_head, rcu_guarded_free); - else + } else { kfence_guarded_free(addr, meta, false); + } } bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) @@ -1182,14 +1207,14 @@ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs int distance = 0; meta = addr_to_metadata(addr - PAGE_SIZE); - if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + if (meta && kfence_obj_allocated(meta)) { to_report = meta; /* Data race ok; distance calculation approximate. */ distance = addr - data_race(meta->addr + meta->size); } meta = addr_to_metadata(addr + PAGE_SIZE); - if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + if (meta && kfence_obj_allocated(meta)) { /* Data race ok; distance calculation approximate. */ if (!to_report || distance > data_race(meta->addr) - addr) to_report = meta; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index db87a05047bd..dfba5ea06b01 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -38,6 +38,7 @@ enum kfence_object_state { KFENCE_OBJECT_UNUSED, /* Object is unused. */ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_RCU_FREEING, /* Object was allocated, and then being freed by rcu. */ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ }; diff --git a/mm/kfence/report.c b/mm/kfence/report.c index c509aed326ce..451991a3a8f2 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -16,6 +16,7 @@ #include <linux/sprintf.h> #include <linux/stacktrace.h> #include <linux/string.h> +#include <linux/sched/clock.h> #include <trace/events/error_report.h> #include <asm/kfence.h> @@ -108,11 +109,15 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; u64 ts_sec = track->ts_nsec; unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC); + u64 interval_nsec = local_clock() - meta->alloc_track.ts_nsec; + unsigned long rem_interval_nsec = do_div(interval_nsec, NSEC_PER_SEC); /* Timestamp matches printk timestamp format. */ - seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n", - show_alloc ? "allocated" : "freed", track->pid, - track->cpu, (unsigned long)ts_sec, rem_nsec / 1000); + seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus (%lu.%06lus ago):\n", + show_alloc ? "allocated" : meta->state == KFENCE_OBJECT_RCU_FREEING ? + "rcu freeing" : "freed", track->pid, + track->cpu, (unsigned long)ts_sec, rem_nsec / 1000, + (unsigned long)interval_nsec, rem_interval_nsec / 1000); if (track->num_stack_entries) { /* Skip allocation/free internals stack. */ @@ -145,7 +150,7 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met kfence_print_stack(seq, meta, true); - if (meta->state == KFENCE_OBJECT_FREED) { + if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING) { seq_con_printf(seq, "\n"); kfence_print_stack(seq, meta, false); } @@ -314,7 +319,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla kpp->kp_slab_cache = meta->cache; kpp->kp_objp = (void *)meta->addr; kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); - if (meta->state == KFENCE_OBJECT_FREED) + if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING) kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); /* get_stack_skipnr() ensures the first entry is outside allocator. */ kpp->kp_ret = kpp->kp_stack[0]; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cdd1d8655a76..f9c39898eaff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * * Note that these are only respected if collapse was initiated by khugepaged. */ -static unsigned int khugepaged_max_ptes_none __read_mostly; +unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_shared __read_mostly; @@ -546,12 +546,14 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, static bool is_refcount_suitable(struct folio *folio) { - int expected_refcount; + int expected_refcount = folio_mapcount(folio); - expected_refcount = folio_mapcount(folio); - if (folio_test_swapcache(folio)) + if (!folio_test_anon(folio) || folio_test_swapcache(folio)) expected_refcount += folio_nr_pages(folio); + if (folio_test_private(folio)) + expected_refcount++; + return folio_ref_count(folio) == expected_refcount; } @@ -625,8 +627,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* - * We can do it before isolate_lru_page because the - * page can't be freed from under us. NOTE: PG_lock + * We can do it before folio_isolate_lru because the + * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ @@ -1235,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); + deferred_split_folio(folio, false); spin_unlock(pmd_ptl); folio = NULL; @@ -1841,7 +1844,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } } while (1); - for (index = start; index < end; index++) { + for (index = start; index < end;) { xas_set(&xas, index); folio = xas_load(&xas); @@ -1860,18 +1863,19 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } } nr_none++; + index++; continue; } if (xa_is_value(folio) || !folio_test_uptodate(folio)) { xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_get_folio(mapping->host, index, + if (shmem_get_folio(mapping->host, index, 0, &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } - /* drain lru cache to help isolate_lru_page() */ + /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); } else if (folio_trylock(folio)) { folio_get(folio); @@ -1886,7 +1890,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, page_cache_sync_readahead(mapping, &file->f_ra, file, index, end - index); - /* drain lru cache to help isolate_lru_page() */ + /* drain lru cache to help folio_isolate_lru() */ lru_add_drain(); folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { @@ -1941,12 +1945,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * we locked the first folio, then a THP might be there already. * This will be discovered on the first iteration. */ - if (folio_test_large(folio)) { - result = folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start - /* Maybe PMD-mapped */ - ? SCAN_PTE_MAPPED_HUGEPAGE - : SCAN_PAGE_COMPOUND; + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; goto out_unlock; } @@ -1986,9 +1988,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* - * We control three references to the folio: + * We control 2 + nr_pages references to the folio: * - we hold a pin on it; - * - one reference from page cache; + * - nr_pages reference from page cache; * - one from lru_isolate_folio; * If those are the only references, then any new usage * of the folio will have to fetch it from the page @@ -1996,7 +1998,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * truncate, so any new usage will be blocked until we * unlock folio after collapse/during rollback. */ - if (folio_ref_count(folio) != 3) { + if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; xas_unlock_irq(&xas); folio_putback_lru(folio); @@ -2007,6 +2009,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, * Accumulate the folios that are being collapsed. */ list_add_tail(&folio->lru, &pagelist); + index += folio_nr_pages(folio); continue; out_unlock: folio_unlock(folio); @@ -2054,17 +2057,22 @@ xa_unlocked: index = start; dst = folio_page(new_folio, 0); list_for_each_entry(folio, &pagelist, lru) { + int i, nr_pages = folio_nr_pages(folio); + while (index < folio->index) { clear_highpage(dst); index++; dst++; } - if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) { - result = SCAN_COPY_MC; - goto rollback; + + for (i = 0; i < nr_pages; i++) { + if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { + result = SCAN_COPY_MC; + goto rollback; + } + index++; + dst++; } - index++; - dst++; } while (index < end) { clear_highpage(dst); @@ -2179,7 +2187,7 @@ immap_locked: folio_clear_active(folio); folio_clear_unevictable(folio); folio_unlock(folio); - folio_put_refs(folio, 3); + folio_put_refs(folio, 2 + folio_nr_pages(folio)); } goto out; @@ -2254,16 +2262,10 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } - /* - * TODO: khugepaged should compact smaller compound pages - * into a PMD sized page - */ - if (folio_test_large(folio)) { - result = folio_order(folio) == HPAGE_PMD_ORDER && - folio->index == start - /* Maybe PMD-mapped */ - ? SCAN_PTE_MAPPED_HUGEPAGE - : SCAN_PAGE_COMPOUND; + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; /* * For SCAN_PTE_MAPPED_HUGEPAGE, further processing * by the caller won't touch the page cache, and so @@ -2285,8 +2287,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, break; } - if (folio_ref_count(folio) != - 1 + folio_mapcount(folio) + folio_test_private(folio)) { + if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; break; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 764b08100570..0400f5e8ac60 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -224,6 +224,10 @@ static int kmemleak_error; static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; +/* minimum and maximum address that may be valid per-CPU pointers */ +static unsigned long min_percpu_addr = ULONG_MAX; +static unsigned long max_percpu_addr; + static struct task_struct *scan_thread; /* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; @@ -294,13 +298,20 @@ static void hex_dump_object(struct seq_file *seq, const u8 *ptr = (const u8 *)object->pointer; size_t len; - if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) return; + if (object->flags & OBJECT_PERCPU) + ptr = (const u8 *)this_cpu_ptr((void __percpu *)object->pointer); + /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); - warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); + if (object->flags & OBJECT_PERCPU) + warn_or_seq_printf(seq, " hex dump (first %zu bytes on cpu %d):\n", + len, raw_smp_processor_id()); + else + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); @@ -695,10 +706,14 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* - * Only update min_addr and max_addr with object - * storing virtual address. + * Only update min_addr and max_addr with object storing virtual + * address. And update min_percpu_addr max_percpu_addr for per-CPU + * objects. */ - if (!(objflags & (OBJECT_PHYS | OBJECT_PERCPU))) { + if (objflags & OBJECT_PERCPU) { + min_percpu_addr = min(min_percpu_addr, untagged_ptr); + max_percpu_addr = max(max_percpu_addr, untagged_ptr + size); + } else if (!(objflags & OBJECT_PHYS)) { min_addr = min(min_addr, untagged_ptr); max_addr = max(max_addr, untagged_ptr + size); } @@ -1055,12 +1070,8 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, { pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); - /* - * Percpu allocations are only scanned and not reported as leaks - * (min_count is set to 0). - */ - if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - create_object_percpu((unsigned long)ptr, size, 0, gfp); + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + create_object_percpu((__force unsigned long)ptr, size, 0, gfp); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1134,8 +1145,8 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) { pr_debug("%s(0x%px)\n", __func__, ptr); - if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) - delete_object_full((unsigned long)ptr, OBJECT_PERCPU); + if (kmemleak_free_enabled && ptr && !IS_ERR_PCPU(ptr)) + delete_object_full((__force unsigned long)ptr, OBJECT_PERCPU); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1304,12 +1315,23 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU))) + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) return false; kasan_disable_current(); kcsan_disable_current(); - object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); + if (object->flags & OBJECT_PERCPU) { + unsigned int cpu; + + object->checksum = 0; + for_each_possible_cpu(cpu) { + void *ptr = per_cpu_ptr((void __percpu *)object->pointer, cpu); + + object->checksum ^= crc32(0, kasan_reset_tag((void *)ptr), object->size); + } + } else { + object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); + } kasan_enable_current(); kcsan_enable_current(); @@ -1340,6 +1362,64 @@ static void update_refs(struct kmemleak_object *object) } } +static void pointer_update_refs(struct kmemleak_object *scanned, + unsigned long pointer, unsigned int objflags) +{ + struct kmemleak_object *object; + unsigned long untagged_ptr; + unsigned long excess_ref; + + untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); + if (objflags & OBJECT_PERCPU) { + if (untagged_ptr < min_percpu_addr || untagged_ptr >= max_percpu_addr) + return; + } else { + if (untagged_ptr < min_addr || untagged_ptr >= max_addr) + return; + } + + /* + * No need for get_object() here since we hold kmemleak_lock. + * object->use_count cannot be dropped to 0 while the object + * is still present in object_tree_root and object_list + * (with updates protected by kmemleak_lock). + */ + object = __lookup_object(pointer, 1, objflags); + if (!object) + return; + if (object == scanned) + /* self referenced, ignore */ + return; + + /* + * Avoid the lockdep recursive warning on object->lock being + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. + */ + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + /* only pass surplus references (object already gray) */ + if (color_gray(object)) { + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); + } + raw_spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + return; + if (object == scanned) + /* circular reference, ignore */ + return; + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + raw_spin_unlock(&object->lock); + } +} + /* * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. @@ -1372,13 +1452,10 @@ static void scan_block(void *_start, void *_end, unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); unsigned long flags; - unsigned long untagged_ptr; raw_spin_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { - struct kmemleak_object *object; unsigned long pointer; - unsigned long excess_ref; if (scan_should_stop()) break; @@ -1387,50 +1464,8 @@ static void scan_block(void *_start, void *_end, pointer = *(unsigned long *)kasan_reset_tag((void *)ptr); kasan_enable_current(); - untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); - if (untagged_ptr < min_addr || untagged_ptr >= max_addr) - continue; - - /* - * No need for get_object() here since we hold kmemleak_lock. - * object->use_count cannot be dropped to 0 while the object - * is still present in object_tree_root and object_list - * (with updates protected by kmemleak_lock). - */ - object = lookup_object(pointer, 1); - if (!object) - continue; - if (object == scanned) - /* self referenced, ignore */ - continue; - - /* - * Avoid the lockdep recursive warning on object->lock being - * previously acquired in scan_object(). These locks are - * enclosed by scan_mutex. - */ - raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - /* only pass surplus references (object already gray) */ - if (color_gray(object)) { - excess_ref = object->excess_ref; - /* no need for update_refs() if object already gray */ - } else { - excess_ref = 0; - update_refs(object); - } - raw_spin_unlock(&object->lock); - - if (excess_ref) { - object = lookup_object(excess_ref, 0); - if (!object) - continue; - if (object == scanned) - /* circular reference, ignore */ - continue; - raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - update_refs(object); - raw_spin_unlock(&object->lock); - } + pointer_update_refs(scanned, pointer, 0); + pointer_update_refs(scanned, pointer, OBJECT_PERCPU); } raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } @@ -608,47 +608,6 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, - struct mm_walk *walk) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t *pte; - pte_t ptent; - int ret; - - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - if (!pte) - return 0; - ptent = ptep_get(pte); - if (pte_present(ptent)) { - page = vm_normal_page(walk->vma, addr, ptent); - } else if (!pte_none(ptent)) { - swp_entry_t entry = pte_to_swp_entry(ptent); - - /* - * As KSM pages remain KSM pages until freed, no need to wait - * here for migration to end. - */ - if (is_migration_entry(entry)) - page = pfn_swap_entry_to_page(entry); - } - /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); - pte_unmap_unlock(pte, ptl); - return ret; -} - -static const struct mm_walk_ops break_ksm_ops = { - .pmd_entry = break_ksm_pmd_entry, - .walk_lock = PGWALK_RDLOCK, -}; - -static const struct mm_walk_ops break_ksm_lock_vma_ops = { - .pmd_entry = break_ksm_pmd_entry, - .walk_lock = PGWALK_WRLOCK, -}; - /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -665,16 +624,26 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = { static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; - const struct mm_walk_ops *ops = lock_vma ? - &break_ksm_lock_vma_ops : &break_ksm_ops; + + if (lock_vma) + vma_start_write(vma); do { - int ksm_page; + bool ksm_page = false; + struct folio_walk fw; + struct folio *folio; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) - return ksm_page; + folio = folio_walk_start(&fw, vma, addr, + FW_MIGRATION | FW_ZEROPAGE); + if (folio) { + /* Small folio implies FW_LEVEL_PTE. */ + if (!folio_test_large(folio) && + (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) + ksm_page = true; + folio_walk_end(&fw, vma); + } + if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, @@ -767,26 +736,28 @@ static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; - struct page *page; + struct page *page = NULL; + struct folio_walk fw; + struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; - page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) - goto out; - if (is_zone_device_page(page)) - goto out_putpage; - if (PageAnon(page)) { + folio = folio_walk_start(&fw, vma, addr, 0); + if (folio) { + if (!folio_is_zone_device(folio) && + folio_test_anon(folio)) { + folio_get(folio); + page = fw.page; + } + folio_walk_end(&fw, vma); + } +out: + if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); - } else { -out_putpage: - put_page(page); -out: - page = NULL; } mmap_read_unlock(mm); return page; @@ -938,12 +909,13 @@ again: */ while (!folio_try_get(folio)) { /* - * Another check for page->mapping != expected_mapping would - * work here too. We have chosen the !PageSwapCache test to - * optimize the common case, when the page is or is about to - * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the ref_freeze section of __remove_mapping(); but Anon - * folio->mapping reset to NULL later, in free_pages_prepare(). + * Another check for folio->mapping != expected_mapping + * would work here too. We have chosen to test the + * swapcache flag to optimize the common case, when the + * folio is or is about to be freed: the swapcache flag + * is cleared (under spin_lock_irq) in the ref_freeze + * section of __remove_mapping(); but anon folio->mapping + * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; @@ -974,7 +946,7 @@ again: stale: /* - * We come here from above when page->mapping or !PageSwapCache + * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. @@ -1481,7 +1453,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; /* - * We need the page lock to read a stable PageSwapCache in + * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We use trylock_page() instead of * lock_page() because we don't want to wait here - we * prefer to continue scanning and merging different pages, @@ -2562,36 +2534,46 @@ next_mm: ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + struct page *tmp_page = NULL; + struct folio_walk fw; + struct folio *folio; + if (ksm_test_exit(mm)) break; - *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page)) { - ksm_scan.address += PAGE_SIZE; - cond_resched(); - continue; + + folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); + if (folio) { + if (!folio_is_zone_device(folio) && + folio_test_anon(folio)) { + folio_get(folio); + tmp_page = fw.page; + } + folio_walk_end(&fw, vma); } - if (is_zone_device_page(*page)) - goto next_page; - if (PageAnon(*page)) { - flush_anon_page(vma, *page, ksm_scan.address); - flush_dcache_page(*page); + + if (tmp_page) { + flush_anon_page(vma, tmp_page, ksm_scan.address); + flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; - if (should_skip_rmap_item(*page, rmap_item)) + if (should_skip_rmap_item(tmp_page, rmap_item)) { + folio_put(folio); goto next_page; + } ksm_scan.address += PAGE_SIZE; - } else - put_page(*page); + *page = tmp_page; + } else { + folio_put(folio); + } mmap_read_unlock(mm); return rmap_item; } next_page: - put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -3142,7 +3124,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping - * has gone stale (or that folio_test_swapcache has been cleared). + * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); diff --git a/mm/madvise.c b/mm/madvise.c index 6e3a137b8e50..ff139e57cca2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; + if (unlikely(!can_modify_vma_madv(vma, behavior))) + return -EPERM; + switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; - /* - * Check if the address range is sealed for do_madvise(). - * can_modify_mm_madv assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { - error = -EPERM; - goto out; - } - blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh } blk_finish_plug(&plug); -out: if (write) mmap_write_unlock(mm); else diff --git a/mm/memblock.c b/mm/memblock.c index 3b9dc2d89b8a..0a77a748a8eb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1500,7 +1500,7 @@ done: * * Accept the memory of the allocated buffer. */ - accept_memory(found, found + size); + accept_memory(found, size); return found; } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 417c96f2da28..b37c0d870816 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -742,6 +742,9 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return folio_file_page(folio, index); } +static void memcg1_check_events(struct mem_cgroup *memcg, int nid); +static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages); + /** * mem_cgroup_move_account - move account of the folio * @folio: The folio. @@ -853,9 +856,9 @@ static int mem_cgroup_move_account(struct folio *folio, nid = folio_nid(folio); local_irq_disable(); - mem_cgroup_charge_statistics(to, nr_pages); + memcg1_charge_statistics(to, nr_pages); memcg1_check_events(to, nid); - mem_cgroup_charge_statistics(from, -nr_pages); + memcg1_charge_statistics(from, -nr_pages); memcg1_check_events(from, nid); local_irq_enable(); out: @@ -1439,21 +1442,68 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) } } +/* Cgroup1: threshold notifications & softlimit tree updates */ +struct memcg1_events_percpu { + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) +{ + /* pagein of a big page is an event. So, ignore page size */ + if (nr_pages > 0) + __count_memcg_events(memcg, PGPGIN, 1); + else { + __count_memcg_events(memcg, PGPGOUT, 1); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages); +} + +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 + +static bool memcg1_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) +{ + unsigned long val, next; + + val = __this_cpu_read(memcg->events_percpu->nr_page_events); + next = __this_cpu_read(memcg->events_percpu->targets[target]); + /* from time_after() in jiffies.h */ + if ((long)(next - val) < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->events_percpu->targets[target], next); + return true; + } + return false; +} + /* * Check events in order. * */ -void memcg1_check_events(struct mem_cgroup *memcg, int nid) +static void memcg1_check_events(struct mem_cgroup *memcg, int nid) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) return; /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(mem_cgroup_event_ratelimit(memcg, + if (unlikely(memcg1_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit; - do_softlimit = mem_cgroup_event_ratelimit(memcg, + do_softlimit = memcg1_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) @@ -1461,6 +1511,43 @@ void memcg1_check_events(struct mem_cgroup *memcg, int nid) } } +void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +{ + unsigned long flags; + + local_irq_save(flags); + memcg1_charge_statistics(memcg, folio_nr_pages(folio)); + memcg1_check_events(memcg, folio_nid(folio)); + local_irq_restore(flags); +} + +void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) +{ + /* + * Interrupts should be disabled here because the caller holds the + * i_pages lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. + */ + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); + memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); + preempt_enable_nested(); + memcg1_check_events(memcg, folio_nid(folio)); +} + +void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_memory, int nid) +{ + unsigned long flags; + + local_irq_save(flags); + __count_memcg_events(memcg, PGPGOUT, pgpgout); + __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); + memcg1_check_events(memcg, nid); + local_irq_restore(flags); +} + static int compare_thresholds(const void *a, const void *b) { const struct mem_cgroup_threshold *_a = a; @@ -1907,9 +1994,15 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, event->register_event = mem_cgroup_usage_register_event; event->unregister_event = mem_cgroup_usage_unregister_event; } else if (!strcmp(name, "memory.oom_control")) { + pr_warn_once("oom_control is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org" + " if you depend on this functionality. \n"); event->register_event = mem_cgroup_oom_register_event; event->unregister_event = mem_cgroup_oom_unregister_event; } else if (!strcmp(name, "memory.pressure_level")) { + pr_warn_once("pressure_level is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org " + "if you depend on this functionality. \n"); event->register_event = vmpressure_register_event; event->unregister_event = vmpressure_unregister_event; } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { @@ -2447,6 +2540,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = 0; break; case _TCP: + pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_tcp_max(memcg, nr_pages); break; } @@ -2455,6 +2551,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ret = -EOPNOTSUPP; } else { + pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); WRITE_ONCE(memcg->soft_limit, nr_pages); ret = 0; } @@ -2748,6 +2847,10 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + pr_warn_once("oom_control is deprecated and will be removed. " + "Please report your usecase to linux-mm-@kvack.org if you " + "depend on this functionality. \n"); + /* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL; @@ -2952,6 +3055,19 @@ bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, return false; } +bool memcg1_alloc_events(struct mem_cgroup *memcg) +{ + memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, + GFP_KERNEL_ACCOUNT); + return !!memcg->events_percpu; +} + +void memcg1_free_events(struct mem_cgroup *memcg) +{ + if (memcg->events_percpu) + free_percpu(memcg->events_percpu); +} + static int __init memcg1_init(void) { int node; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 56d7eaa98274..c0672e25bcdb 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -7,7 +7,6 @@ /* Cgroup v1 and v2 common declarations */ -void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages); int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages); @@ -56,8 +55,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target); unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); @@ -71,6 +68,10 @@ int memory_stat_show(struct seq_file *m, void *v); /* Cgroup v1-specific declarations */ #ifdef CONFIG_MEMCG_V1 + +bool memcg1_alloc_events(struct mem_cgroup *memcg); +void memcg1_free_events(struct mem_cgroup *memcg); + void memcg1_memcg_init(struct mem_cgroup *memcg); void memcg1_remove_from_trees(struct mem_cgroup *memcg); @@ -99,7 +100,10 @@ bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked); void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked); void memcg1_oom_recover(struct mem_cgroup *memcg); -void memcg1_check_events(struct mem_cgroup *memcg, int nid); +void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg); +void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg); +void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_memory, int nid); void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s); @@ -120,6 +124,9 @@ extern struct cftype mem_cgroup_legacy_files[]; #else /* CONFIG_MEMCG_V1 */ +static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; } +static inline void memcg1_free_events(struct mem_cgroup *memcg) {} + static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {} static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {} static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {} @@ -130,7 +137,14 @@ static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { static inline void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) {} static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {} -static inline void memcg1_check_events(struct mem_cgroup *memcg, int nid) {} +static inline void memcg1_commit_charge(struct folio *folio, + struct mem_cgroup *memcg) {} + +static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {} + +static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg, + unsigned long pgpgout, + unsigned long nr_memory, int nid) {} static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {} @@ -140,8 +154,6 @@ static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr gfp_t gfp_mask) { return true; } static inline void memcg1_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) {} -extern struct cftype memsw_files[]; -extern struct cftype mem_cgroup_legacy_files[]; #endif /* CONFIG_MEMCG_V1 */ #endif /* __MM_MEMCONTROL_V1_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d563fb515766..7845c64a2c57 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,6 +25,7 @@ * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ +#include <linux/cgroup-defs.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/cgroup.h> @@ -41,6 +42,7 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/export.h> +#include <linux/list.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/slab.h> @@ -93,9 +95,6 @@ static bool cgroup_memory_nobpf __ro_after_init; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif -#define THRESHOLDS_EVENTS_TARGET 128 -#define SOFTLIMIT_EVENTS_TARGET 1024 - static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -305,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = { #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif +#ifdef CONFIG_NUMA_BALANCING + PGPROMOTE_SUCCESS, +#endif + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, }; static const unsigned int memcg_stat_items[] = { @@ -320,24 +325,27 @@ static const unsigned int memcg_stat_items[] = { #define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items) #define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \ ARRAY_SIZE(memcg_stat_items)) -static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; +#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX) +static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly; static void init_memcg_stats(void) { - int8_t i, j = 0; + u8 i, j = 0; + + BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX); - BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX); + memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index)); - for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i) - mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j; + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j) + mem_cgroup_stats_index[memcg_node_stat_items[i]] = j; - for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i) - mem_cgroup_stats_index[memcg_stat_items[i]] = ++j; + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j) + mem_cgroup_stats_index[memcg_stat_items[i]] = j; } static inline int memcg_stats_index(int idx) { - return mem_cgroup_stats_index[idx] - 1; + return mem_cgroup_stats_index[idx]; } struct lruvec_stats_percpu { @@ -369,7 +377,7 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -392,7 +400,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -406,8 +414,10 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { +#ifdef CONFIG_MEMCG_V1 PGPGIN, PGPGOUT, +#endif PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, @@ -432,24 +442,32 @@ static const unsigned int memcg_vm_event_stat[] = { THP_SWPOUT, THP_SWPOUT_FALLBACK, #endif +#ifdef CONFIG_NUMA_BALANCING + NUMA_PAGE_MIGRATE, + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, +#endif }; #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) -static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; +static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; static void init_memcg_events(void) { - int8_t i; + u8 i; - BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX); + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX); + + memset(mem_cgroup_events_index, U8_MAX, + sizeof(mem_cgroup_events_index)); for (i = 0; i < NR_MEMCG_EVENTS; ++i) - mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i; } static inline int memcg_events_index(enum vm_event_item idx) { - return mem_cgroup_events_index[idx] - 1; + return mem_cgroup_events_index[idx]; } struct memcg_vmstats_percpu { @@ -469,10 +487,6 @@ struct memcg_vmstats_percpu { /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_VMSTAT_SIZE]; unsigned long events_prev[NR_MEMCG_EVENTS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; } ____cacheline_aligned; struct memcg_vmstats { @@ -621,7 +635,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state[i]); @@ -662,7 +676,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; __this_cpu_add(memcg->vmstats_percpu->state[i], val); @@ -675,7 +689,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) long x; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return 0; x = READ_ONCE(memcg->vmstats->state_local[i]); @@ -694,7 +708,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, struct mem_cgroup *memcg; int i = memcg_stats_index(idx); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); @@ -810,7 +824,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; memcg_stats_lock(); @@ -823,7 +837,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events[i]); @@ -833,50 +847,12 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { int i = memcg_events_index(event); - if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event)) + if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event)) return 0; return READ_ONCE(memcg->vmstats->events_local[i]); } -void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages) -{ - /* pagein of a big page is an event. So, ignore page size */ - if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); - else { - __count_memcg_events(memcg, PGPGOUT, 1); - nr_pages = -nr_pages; /* for event */ - } - - __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); -} - -bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, - enum mem_cgroup_events_target target) -{ - unsigned long val, next; - - val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); - next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); - /* from time_after() in jiffies.h */ - if ((long)(next - val) < 0) { - switch (target) { - case MEM_CGROUP_TARGET_THRESH: - next = val + THRESHOLDS_EVENTS_TARGET; - break; - case MEM_CGROUP_TARGET_SOFTLIMIT: - next = val + SOFTLIMIT_EVENTS_TARGET; - break; - default: - break; - } - __this_cpu_write(memcg->vmstats_percpu->targets[target], next); - return true; - } - return false; -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -971,6 +947,24 @@ again: } /** + * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg. + * @folio: folio from which memcg should be extracted. + */ +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + + if (mem_cgroup_disabled()) + return NULL; + + rcu_read_lock(); + if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; + rcu_read_unlock(); + return memcg; +} + +/** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation @@ -992,9 +986,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup_reclaim_iter *iter; - struct cgroup_subsys_state *css = NULL; - struct mem_cgroup *memcg = NULL; - struct mem_cgroup *pos = NULL; + struct cgroup_subsys_state *css; + struct mem_cgroup *pos; + struct mem_cgroup *next; if (mem_cgroup_disabled()) return NULL; @@ -1003,81 +997,67 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; rcu_read_lock(); +restart: + next = NULL; if (reclaim) { - struct mem_cgroup_per_node *mz; + int gen; + int nid = reclaim->pgdat->node_id; - mz = root->nodeinfo[reclaim->pgdat->node_id]; - iter = &mz->iter; + iter = &root->nodeinfo[nid]->iter; + gen = atomic_read(&iter->generation); /* * On start, join the current reclaim iteration cycle. * Exit when a concurrent walker completes it. */ if (!prev) - reclaim->generation = iter->generation; - else if (reclaim->generation != iter->generation) + reclaim->generation = gen; + else if (reclaim->generation != gen) goto out_unlock; - while (1) { - pos = READ_ONCE(iter->position); - if (!pos || css_tryget(&pos->css)) - break; - /* - * css reference reached zero, so iter->position will - * be cleared by ->css_released. However, we should not - * rely on this happening soon, because ->css_released - * is called from a work queue, and by busy-waiting we - * might block it. So we clear iter->position right - * away. - */ - (void)cmpxchg(&iter->position, pos, NULL); - } - } else if (prev) { + pos = READ_ONCE(iter->position); + } else pos = prev; - } - if (pos) - css = &pos->css; - - for (;;) { - css = css_next_descendant_pre(css, &root->css); - if (!css) { - /* - * Reclaimers share the hierarchy walk, and a - * new one might jump in right at the end of - * the hierarchy - make sure they see at least - * one group and restart from the beginning. - */ - if (!prev) - continue; - break; - } + css = pos ? &pos->css : NULL; + while ((css = css_next_descendant_pre(css, &root->css))) { /* * Verify the css and acquire a reference. The root * is provided by the caller, so we know it's alive * and kicking, and don't take an extra reference. */ - if (css == &root->css || css_tryget(css)) { - memcg = mem_cgroup_from_css(css); + if (css == &root->css || css_tryget(css)) break; - } } + next = mem_cgroup_from_css(css); + if (reclaim) { /* * The position could have already been updated by a competing * thread, so check that the value hasn't changed since we read * it to avoid reclaiming from the same cgroup twice. */ - (void)cmpxchg(&iter->position, pos, memcg); + if (cmpxchg(&iter->position, pos, next) != pos) { + if (css && css != &root->css) + css_put(css); + goto restart; + } - if (pos) - css_put(&pos->css); + if (!next) { + atomic_inc(&iter->generation); - if (!memcg) - iter->generation++; + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + goto restart; + } } out_unlock: @@ -1085,7 +1065,7 @@ out_unlock: if (prev && prev != root) css_put(&prev->css); - return memcg; + return next; } /** @@ -1375,6 +1355,13 @@ static const struct memory_stat memory_stats[] = { { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, + + { "pgdemote_kswapd", PGDEMOTE_KSWAPD }, + { "pgdemote_direct", PGDEMOTE_DIRECT }, + { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED }, +#ifdef CONFIG_NUMA_BALANCING + { "pgpromote_success", PGPROMOTE_SUCCESS }, +#endif }; /* The actual unit of the state item, not the same as the output unit */ @@ -1399,6 +1386,9 @@ static int memcg_page_state_output_unit(int item) /* * Workingset state is actually in pages, but we export it to userspace * as a scalar count of events, so special case it here. + * + * Demotion and promotion activities are exported in pages, consistent + * with their global counterparts. */ switch (item) { case WORKINGSET_REFAULT_ANON: @@ -1408,6 +1398,12 @@ static int memcg_page_state_output_unit(int item) case WORKINGSET_RESTORE_ANON: case WORKINGSET_RESTORE_FILE: case WORKINGSET_NODERECLAIM: + case PGDEMOTE_KSWAPD: + case PGDEMOTE_DIRECT: + case PGDEMOTE_KHUGEPAGED: +#ifdef CONFIG_NUMA_BALANCING + case PGPROMOTE_SUCCESS: +#endif return 1; default: return memcg_page_state_unit(item); @@ -1466,10 +1462,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { +#ifdef CONFIG_MEMCG_V1 if (memcg_vm_event_stat[i] == PGPGIN || memcg_vm_event_stat[i] == PGPGOUT) continue; - +#endif seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); @@ -2366,7 +2363,7 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) { - VM_BUG_ON_FOLIO(folio_memcg(folio), folio); + VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio); /* * Any of the following ensures page's memcg stability: * @@ -2388,11 +2385,7 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { css_get(&memcg->css); commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); - memcg1_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + memcg1_commit_charge(folio, memcg); } static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, @@ -2446,37 +2439,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * Returns a pointer to the memory cgroup to which the kernel object is charged. - * - * A passed kernel object can be a slab object, vmalloc object or a generic - * kernel page, so different mechanisms for getting the memory cgroup pointer - * should be used. - * - * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller - * can not know for sure how the kernel object is implemented. - * mem_cgroup_from_obj() can be safely used in such cases. - * - * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), - * cgroup_mutex, etc. - */ -struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - struct folio *folio; - - if (mem_cgroup_disabled()) - return NULL; - - if (unlikely(is_vmalloc_addr(p))) - folio = page_folio(vmalloc_to_page(p)); - else - folio = virt_to_folio(p); - - return mem_cgroup_from_obj_folio(folio, p); -} - -/* - * Returns a pointer to the memory cgroup to which the kernel object is charged. - * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, - * allocated using vmalloc(). + * It is not suitable for objects allocated using vmalloc(). * * A passed kernel object must be a slab object or a generic kernel page. * @@ -3057,12 +3020,11 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void split_page_memcg(struct page *head, int old_order, int new_order) { struct folio *folio = page_folio(head); - struct mem_cgroup *memcg = folio_memcg(folio); int i; unsigned int old_nr = 1 << old_order; unsigned int new_nr = 1 << new_order; - if (mem_cgroup_disabled() || !memcg) + if (mem_cgroup_disabled() || !folio_memcg_charged(folio)) return; for (i = new_nr; i < old_nr; i += new_nr) @@ -3071,7 +3033,7 @@ void split_page_memcg(struct page *head, int old_order, int new_order) if (folio_memcg_kmem(folio)) obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1); else - css_get_many(&memcg->css, old_nr / new_nr - 1); + css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1); } unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3385,29 +3347,12 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) */ #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) -static DEFINE_IDR(mem_cgroup_idr); -static DEFINE_SPINLOCK(memcg_idr_lock); - -static int mem_cgroup_alloc_id(void) -{ - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&memcg_idr_lock); - ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1, - GFP_NOWAIT); - spin_unlock(&memcg_idr_lock); - idr_preload_end(); - return ret; -} +static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) { if (memcg->id.id > 0) { - spin_lock(&memcg_idr_lock); - idr_remove(&mem_cgroup_idr, memcg->id.id); - spin_unlock(&memcg_idr_lock); - + xa_erase(&mem_cgroup_ids, memcg->id.id); memcg->id.id = 0; } } @@ -3442,7 +3387,7 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); - return idr_find(&mem_cgroup_idr, id); + return xa_load(&mem_cgroup_ids, id); } #ifdef CONFIG_SHRINKER_DEBUG @@ -3517,6 +3462,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + memcg1_free_events(memcg); kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); @@ -3535,17 +3481,17 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; - long error = -ENOMEM; + long error; memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return ERR_PTR(-ENOMEM); - memcg->id.id = mem_cgroup_alloc_id(); - if (memcg->id.id < 0) { - error = memcg->id.id; + error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL, + XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL); + if (error) goto fail; - } + error = -ENOMEM; memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL_ACCOUNT); @@ -3557,6 +3503,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) if (!memcg->vmstats_percpu) goto fail; + if (!memcg1_alloc_events(memcg)) + goto fail; + for_each_possible_cpu(cpu) { if (parent) pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); @@ -3574,6 +3523,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) INIT_WORK(&memcg->high_work, high_work_func); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->memory_peaks); + INIT_LIST_HEAD(&memcg->swap_peaks); + spin_lock_init(&memcg->peaks_lock); memcg->socket_pressure = jiffies; memcg1_memcg_init(memcg); memcg->kmemcg_id = -1; @@ -3619,21 +3571,21 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent)); - page_counter_init(&memcg->memory, &parent->memory); - page_counter_init(&memcg->swap, &parent->swap); + page_counter_init(&memcg->memory, &parent->memory, true); + page_counter_init(&memcg->swap, &parent->swap, false); #ifdef CONFIG_MEMCG_V1 WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable)); - page_counter_init(&memcg->kmem, &parent->kmem); - page_counter_init(&memcg->tcpmem, &parent->tcpmem); + page_counter_init(&memcg->kmem, &parent->kmem, false); + page_counter_init(&memcg->tcpmem, &parent->tcpmem, false); #endif } else { init_memcg_stats(); init_memcg_events(); - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->memory, NULL, true); + page_counter_init(&memcg->swap, NULL, false); #ifdef CONFIG_MEMCG_V1 - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); + page_counter_init(&memcg->kmem, NULL, false); + page_counter_init(&memcg->tcpmem, NULL, false); #endif root_mem_cgroup = memcg; return &memcg->css; @@ -3682,9 +3634,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) * publish it here at the end of onlining. This matches the * regular ID destruction during offlining. */ - spin_lock(&memcg_idr_lock); - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); - spin_unlock(&memcg_idr_lock); + xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL); return 0; offline_kmem: @@ -3967,14 +3917,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } -static u64 memory_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) +#define OFP_PEAK_UNSET (((-1UL))) + +static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_of_peak *ofp = of_peak(sf->private); + u64 fd_peak = READ_ONCE(ofp->value), peak; + + /* User wants global or local peak? */ + if (fd_peak == OFP_PEAK_UNSET) + peak = pc->watermark; + else + peak = max(fd_peak, READ_ONCE(pc->local_watermark)); + + seq_printf(sf, "%llu\n", peak * PAGE_SIZE); + return 0; +} + +static int memory_peak_show(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); - return (u64)memcg->memory.watermark * PAGE_SIZE; + return peak_show(sf, v, &memcg->memory); } +static int peak_open(struct kernfs_open_file *of) +{ + struct cgroup_of_peak *ofp = of_peak(of); + + ofp->value = OFP_PEAK_UNSET; + return 0; +} + +static void peak_release(struct kernfs_open_file *of) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + if (ofp->value == OFP_PEAK_UNSET) { + /* fast path (no writes on this fd) */ + return; + } + spin_lock(&memcg->peaks_lock); + list_del(&ofp->list); + spin_unlock(&memcg->peaks_lock); +} + +static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, + loff_t off, struct page_counter *pc, + struct list_head *watchers) +{ + unsigned long usage; + struct cgroup_of_peak *peer_ctx; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup_of_peak *ofp = of_peak(of); + + spin_lock(&memcg->peaks_lock); + + usage = page_counter_read(pc); + WRITE_ONCE(pc->local_watermark, usage); + + list_for_each_entry(peer_ctx, watchers, list) + if (usage > peer_ctx->value) + WRITE_ONCE(peer_ctx->value, usage); + + /* initial write, register watcher */ + if (ofp->value == -1) + list_add(&ofp->list, watchers); + + WRITE_ONCE(ofp->value, usage); + spin_unlock(&memcg->peaks_lock); + + return nbytes; +} + +static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return peak_write(of, buf, nbytes, off, &memcg->memory, + &memcg->memory_peaks); +} + +#undef OFP_PEAK_UNSET + static int memory_min_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -4324,7 +4351,10 @@ static struct cftype memory_files[] = { { .name = "peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = memory_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = memory_peak_show, + .write = memory_peak_write, }, { .name = "min", @@ -4528,14 +4558,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, /* * mem_cgroup_swapin_uncharge_swap - uncharge swap slot - * @entry: swap entry for which the page is charged + * @entry: the first swap entry for which the pages are charged + * @nr_pages: number of pages which will be uncharged * * Call this function after successfully adding the charged page to swapcache. * * Note: This function assumes the page for which swap slot is being uncharged * is order 0 page. */ -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { /* * Cgroup1's unified memory+swap counter has been charged with the @@ -4555,7 +4586,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -4574,8 +4605,6 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long flags; - if (ug->nr_memory) { page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) @@ -4587,11 +4616,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) memcg1_oom_recover(ug->memcg); } - local_irq_save(flags); - __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); - memcg1_check_events(ug->memcg, ug->nid); - local_irq_restore(flags); + memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid); /* drop reference from uncharge_folio */ css_put(&ug->memcg->css); @@ -4606,7 +4631,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) > 1 && !folio_test_hugetlb(folio) && - !list_empty(&folio->_deferred_list), folio); + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio), folio); /* * Nobody should be changing or seriously looking at @@ -4664,7 +4690,7 @@ void __mem_cgroup_uncharge(struct folio *folio) struct uncharge_gather ug; /* Don't touch folio->lru of any random page, pre-check: */ - if (!folio_memcg(folio)) + if (!folio_memcg_charged(folio)) return; uncharge_gather_clear(&ug); @@ -4698,7 +4724,6 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); - unsigned long flags; VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); @@ -4709,7 +4734,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) return; /* Page cache replacement: new folio already charged? */ - if (folio_memcg(new)) + if (folio_memcg_charged(new)) return; memcg = folio_memcg(old); @@ -4726,11 +4751,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new) css_get(&memcg->css); commit_charge(new, memcg); - - local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg1_check_events(memcg, folio_nid(new)); - local_irq_restore(flags); + memcg1_commit_charge(new, memcg); } /** @@ -4966,17 +4987,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) page_counter_uncharge(&memcg->memsw, nr_entries); } - /* - * Interrupts should be disabled here because the caller holds the - * i_pages lock which is taken with interrupts-off. It is - * important here to have the interrupts disabled because it is the - * only synchronisation we have for updating the per-CPU variables. - */ - memcg_stats_lock(); - mem_cgroup_charge_statistics(memcg, -nr_entries); - memcg_stats_unlock(); - memcg1_check_events(memcg, folio_nid(folio)); - + memcg1_swapout(folio, memcg); css_put(&memcg->css); } @@ -5116,12 +5127,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } -static u64 swap_peak_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int swap_peak_show(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + + return peak_show(sf, v, &memcg->swap); +} + +static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - return (u64)memcg->swap.watermark * PAGE_SIZE; + return peak_write(of, buf, nbytes, off, &memcg->swap, + &memcg->swap_peaks); } static int swap_high_show(struct seq_file *m, void *v) @@ -5205,7 +5224,10 @@ static struct cftype swap_files[] = { { .name = "swap.peak", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = swap_peak_read, + .open = peak_open, + .release = peak_release, + .seq_show = swap_peak_show, + .write = swap_peak_write, }, { .name = "swap.events", diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7066fc84f351..96ce31e5a203 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1554,6 +1554,32 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) return ret; } +void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) +{ + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { + struct address_space *mapping; + + /* + * For hugetlb folios in shared mappings, try_to_unmap + * could potentially call huge_pmd_unshare. Because of + * this, take semaphore in write mode here and set + * TTU_RMAP_LOCKED to indicate we have taken the lock + * at this higher level. + */ + mapping = hugetlb_folio_mapping_lock_write(folio); + if (!mapping) { + pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n", + folio_pfn(folio)); + return; + } + + try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); + i_mmap_unlock_write(mapping); + } else { + try_to_unmap(folio, ttu); + } +} + /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1615,23 +1641,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { - /* - * For hugetlb pages in shared mappings, try_to_unmap - * could potentially call huge_pmd_unshare. Because of - * this, take semaphore in write mode here and set - * TTU_RMAP_LOCKED to indicate we have taken the lock - * at this higher level. - */ - mapping = hugetlb_folio_mapping_lock_write(folio); - if (mapping) { - try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); - i_mmap_unlock_write(mapping); - } else - pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); - } else { - try_to_unmap(folio, ttu); - } + unmap_poisoned_folio(folio, ttu); unmap_success = !folio_mapped(folio); if (!unmap_success) @@ -2643,40 +2653,6 @@ EXPORT_SYMBOL(unpoison_memory); #undef pr_fmt #define pr_fmt(fmt) "Soft offline: " fmt -static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist) -{ - bool isolated = false; - - if (folio_test_hugetlb(folio)) { - isolated = isolate_hugetlb(folio, pagelist); - } else { - bool lru = !__folio_test_movable(folio); - - if (lru) - isolated = folio_isolate_lru(folio); - else - isolated = isolate_movable_page(&folio->page, - ISOLATE_UNEVICTABLE); - - if (isolated) { - list_add(&folio->lru, pagelist); - if (lru) - node_stat_add_folio(folio, NR_ISOLATED_ANON + - folio_is_file_lru(folio)); - } - } - - /* - * If we succeed to isolate the folio, we grabbed another refcount on - * the folio, so we can safely drop the one we got from get_any_page(). - * If we failed to isolate the folio, it means that we cannot go further - * and we will return an error, so drop the reference we got from - * get_any_page() as well. - */ - folio_put(folio); - return isolated; -} - /* * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. @@ -2689,6 +2665,7 @@ static int soft_offline_in_use_page(struct page *page) struct folio *folio = page_folio(page); char const *msg_page[] = {"page", "hugepage"}; bool huge = folio_test_hugetlb(folio); + bool isolated; LIST_HEAD(pagelist); struct migration_target_control mtc = { .nid = NUMA_NO_NODE, @@ -2728,7 +2705,18 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (mf_isolate_folio(folio, &pagelist)) { + isolated = isolate_folio_to_list(folio, &pagelist); + + /* + * If we succeed to isolate the folio, we grabbed another refcount on + * the folio, so we can safely drop the one we got from get_any_page(). + * If we failed to isolate the folio, it means that we cannot go further + * and we will return an error, so drop the reference we got from + * get_any_page() as well. + */ + folio_put(folio); + + if (isolated) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 4775b3a3dabe..ed7607f692bd 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -6,6 +6,7 @@ #include <linux/memory.h> #include <linux/memory-tiers.h> #include <linux/notifier.h> +#include <linux/sched/sysctl.h> #include "internal.h" @@ -50,6 +51,24 @@ static const struct bus_type memory_tier_subsys = { .dev_name = "memory_tier", }; +#ifdef CONFIG_NUMA_BALANCING +/** + * folio_use_access_time - check if a folio reuses cpupid for page access time + * @folio: folio to check + * + * folio's _last_cpupid field is repurposed by memory tiering. In memory + * tiering mode, cpupid of slow memory folio (not toptier memory) is used to + * record page access time. + * + * Return: the folio _last_cpupid is used to record page access time + */ +bool folio_use_access_time(struct folio *folio) +{ + return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(folio_nid(folio)); +} +#endif + #ifdef CONFIG_MIGRATION static int top_tier_adistance; /* @@ -895,13 +914,14 @@ static int __init memory_tier_init(void) WARN_ON(!node_demotion); #endif - guard(mutex)(&memory_tier_lock); + mutex_lock(&memory_tier_lock); /* * For now we can have 4 faster memory tiers with smaller adistance * than default DRAM tier. */ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM, &default_memory_types); + mutex_unlock(&memory_tier_lock); if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); diff --git a/mm/memory.c b/mm/memory.c index cda2c12c500b..2366578015ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -666,17 +666,16 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, return NULL; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); - /* - * There is no pmd_special() but there may be special pmds, e.g. - * in a direct-access (dax) mapping, so let's just replicate the - * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. - */ + /* Currently it's only used for huge pfnmaps */ + if (unlikely(pmd_special(pmd))) + return NULL; + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) @@ -927,8 +926,11 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ + + if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) + return -EHWPOISON; + *prealloc = NULL; - copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); @@ -1167,8 +1169,9 @@ again: /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. + * If copy failed due to hwpoison in source page, break out. */ - if (unlikely(ret == -EAGAIN)) + if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* @@ -1198,7 +1201,7 @@ again: goto out; } entry.val = 0; - } else if (ret == -EBUSY) { + } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); @@ -4001,6 +4004,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +static struct folio *__alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + swp_entry_t entry; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!folio) + return NULL; + + entry = pte_to_swp_entry(vmf->orig_pte); + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + GFP_KERNEL, entry)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing swap_read_folio, which is + * the case the being faulted pte doesn't have swapcache. We need to + * ensure all PTEs have no cache as well, otherwise, we might go to + * swap devices while the content is in swapcache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + +/* + * Check if the PTEs within a range are contiguous swap entries + * and have consistent swapcache, zeromap. + */ +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + unsigned long addr; + swp_entry_t entry; + int idx; + pte_t pte; + + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + idx = (vmf->address - addr) / PAGE_SIZE; + pte = ptep_get(ptep); + + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) + return false; + entry = pte_to_swp_entry(pte); + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) + return false; + + /* + * swap_read_folio() can't handle the case a large folio is hybridly + * from different backends. And they are likely corner cases. Similar + * things might be added once zswap support large folios. + */ + if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) + return false; + if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) + return false; + + return true; +} + +static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, + unsigned long addr, + unsigned long orders) +{ + int order, nr; + + order = highest_order(orders); + + /* + * To swap in a THP with nr pages, we require that its first swap_offset + * is aligned with that number, as it was when the THP was swapped out. + * This helps filter out most invalid entries. + */ + while (orders) { + nr = 1 << order; + if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) + break; + order = next_order(&orders, order); + } + + return orders; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + swp_entry_t entry; + spinlock_t *ptl; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * A large swapped out folio could be partially or fully in zswap. We + * lack handling for such cases, so fallback to swapping in order-0 + * folio. + */ + if (!zswap_never_enabled()) + goto fallback; + + entry = pte_to_swp_entry(vmf->orig_pte); + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * and suitable for swapping THP. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + orders = thp_swap_suitable_orders(swp_offset(entry), + vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address & PMD_MASK, &ptl); + if (unlikely(!pte)) + goto fallback; + + /* + * For do_swap_page, find the highest order where the aligned range is + * completely swap entries with contiguous swap offsets. + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap_unlock(pte, ptl); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr, true); + if (folio) { + if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + gfp, entry)) + return folio; + folio_put(folio); + } + order = next_order(&orders, order); + } + +fallback: + return __alloc_swap_folio(vmf); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + return false; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + return __alloc_swap_folio(vmf); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4089,35 +4280,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { - /* - * Prevent parallel swapin from proceeding with - * the cache flag. Otherwise, another thread may - * finish swapin first, free the entry, and swapout - * reusing the same entry. It's undetectable as - * pte_same() returns true due to entry reuse. - */ - if (swapcache_prepare(entry)) { - /* Relax a bit to prevent rapid repeated page faults */ - schedule_timeout_uninterruptible(1); - goto out; - } - need_clear_cache = true; - /* skip swapcache */ - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, - vma, vmf->address, false); - page = &folio->page; + folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_folio(folio, - vma->vm_mm, GFP_KERNEL, - entry)) { - ret = VM_FAULT_OOM; + nr_pages = folio_nr_pages(folio); + if (folio_test_large(folio)) + entry.val = ALIGN_DOWN(entry.val, nr_pages); + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread + * may finish swapin first, free the entry, and + * swapout reusing the same entry. It's + * undetectable as pte_same() returns true due + * to entry reuse. + */ + if (swapcache_prepare(entry, nr_pages)) { + /* + * Relax a bit to prevent rapid + * repeated page faults. + */ + schedule_timeout_uninterruptible(1); goto out_page; } - mem_cgroup_swapin_uncharge_swap(entry); + need_clear_cache = true; + + mem_cgroup_swapin_uncharge_swap(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) @@ -4131,10 +4321,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio->private = NULL; } } else { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - if (page) - folio = page_folio(page); swapcache = folio; } @@ -4155,6 +4343,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -4224,6 +4413,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + /* allocated large folios for SWP_SYNCHRONOUS_IO */ + if (folio_test_large(folio) && !folio_test_swapcache(folio)) { + unsigned long nr = folio_nr_pages(folio); + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; + pte_t *folio_ptep = vmf->pte - idx; + pte_t folio_pte = ptep_get(folio_ptep); + + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto out_nomap; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + goto check_folio; + } + nr_pages = 1; page_idx = 0; address = vmf->address; @@ -4355,11 +4562,12 @@ check_folio: folio_add_lru_vma(folio, vma); } else if (!folio_test_anon(folio)) { /* - * We currently only expect small !anon folios, which are either - * fully exclusive or fully shared. If we ever get large folios - * here, we have to be careful. + * We currently only expect small !anon folios which are either + * fully exclusive or fully shared, or new allocated large + * folios which are fully exclusive. If we ever get large + * folios within swapcache here, we have to be careful. */ - VM_WARN_ON_ONCE(folio_test_large(folio)); + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { @@ -4402,7 +4610,7 @@ unlock: out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4418,7 +4626,7 @@ out_release: folio_put(swapcache); } if (need_clear_cache) - swapcache_clear(si, entry); + swapcache_clear(si, entry, nr_pages); if (si) put_swap_device(si); return ret; @@ -4612,9 +4820,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); -#endif folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: @@ -5109,10 +5315,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret & VM_FAULT_DONE_COW) return ret; - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { + ret = VM_FAULT_HWPOISON; + goto unlock; + } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); +unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -5217,16 +5427,46 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, - unsigned long addr, int page_nid, int *flags) +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, + unsigned long addr, int *flags, + bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; + /* + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. + */ + if (!writable) + *flags |= TNF_NO_GROUP; + + /* + * Flag if the folio is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + *flags |= TNF_SHARED; + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (folio_use_access_time(folio)) + *last_cpupid = (-1 & LAST_CPUPID_MASK); + else + *last_cpupid = folio_last_cpupid(folio); + /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) { +#ifdef CONFIG_NUMA_BALANCING + count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); +#endif + if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } @@ -5328,36 +5568,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* - * Avoid grouping on RO pages in general. RO pages shouldn't hurt as - * much anyway since they can be in shared cache state. This misses - * the case where a mapping is writable but the process never writes - * to it but pte_write gets cleared during protection updates and - * pte_dirty has unpredictable behaviour between PTE scan updates, - * background writeback, dirty balancing and application behaviour. - */ - if (!writable) - flags |= TNF_NO_GROUP; - - /* - * Flag if the folio is shared between multiple address spaces. This - * is later used when determining whether to group tasks together - */ - if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) - flags |= TNF_SHARED; - nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); - /* - * For memory tiering mode, cpupid of slow memory page is used - * to record page access time. So use default value. - */ - if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(nid)) - last_cpupid = (-1 & LAST_CPUPID_MASK); - else - last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags); + + target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, + writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { @@ -6013,10 +6228,6 @@ retry: if (!vma_start_read(vma)) goto inval; - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - /* Check if the VMA got isolated after we found it */ if (vma->detached) { vma_end_read(vma); @@ -6024,6 +6235,16 @@ retry: /* The area was replaced with another one */ goto retry; } + /* + * At this point, we have a stable reference to a VMA: The VMA is + * locked and we know it hasn't already been isolated. + * From here on, we can access the VMA without worrying about which + * fields are accessible for RCU readers. + */ + + /* Check since vm_start/vm_end might change before we lock the VMA */ + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto inval_end_read; rcu_read_unlock(); return vma; @@ -6108,78 +6329,155 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, + spinlock_t *lock, pte_t *ptep, + pgprot_t pgprot, unsigned long pfn_base, + unsigned long addr_mask, bool writable, + bool special) +{ + args->lock = lock; + args->ptep = ptep; + args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->pgprot = pgprot; + args->writable = writable; + args->special = special; +} + +static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) +{ +#ifdef CONFIG_LOCKDEP + struct address_space *mapping = vma->vm_file->f_mapping; + + if (mapping) + lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || + lockdep_is_held(&vma->vm_mm->mmap_lock)); + else + lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); +#endif +} + /** - * follow_pte - look up PTE at a user virtual address - * @vma: the memory mapping - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE + * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + * @args: Pointer to struct @follow_pfnmap_args + * + * The caller needs to setup args->vma and args->address to point to the + * virtual address as the target of such lookup. On a successful return, + * the results will be put into other output fields. * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. + * After the caller finished using the fields, the caller must invoke + * another follow_pfnmap_end() to proper releases the locks and resources + * of such look up request. * - * The contents of the PTE are only stable until @ptlp is released using - * pte_unmap_unlock(). This function will fail if the PTE is non-present. - * Present PTEs may include PTEs that map refcounted pages, such as - * anonymous folios in COW mappings. + * During the start() and end() calls, the results in @args will be valid + * as proper locks will be held. After the end() is called, all the fields + * in @follow_pfnmap_args will be invalid to be further accessed. Further + * use of such information after end() may require proper synchronizations + * by the caller with page table updates, otherwise it can create a + * security bug. * - * Callers must be careful when relying on PTE content after - * pte_unmap_unlock(). Especially if the PTE maps a refcounted page, - * callers must protect against invalidation with MMU notifiers; otherwise - * access to the PFN at a later point in time can trigger use-after-free. + * If the PTE maps a refcounted page, callers are responsible to protect + * against invalidation with MMU notifiers; otherwise access to the PFN at + * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. + * should be taken for read, and the mmap semaphore cannot be released + * before the end() is invoked. * * This function must not be used to modify PTE content. * - * Return: zero on success, -ve otherwise. + * Return: zero on success, negative otherwise. */ -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pfnmap_start(struct follow_pfnmap_args *args) { + struct vm_area_struct *vma = args->vma; + unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; + spinlock_t *lock; + pgd_t *pgdp; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pfnmap_lockdep_assert(vma); - mmap_assert_locked(mm); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) +retry: + pgdp = pgd_offset(mm, address); + if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + p4dp = p4d_offset(pgdp, address); + p4d = READ_ONCE(*p4dp); + if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; - pud = pud_offset(p4d, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + pudp = pud_offset(p4dp, address); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) goto out; + if (pud_leaf(pud)) { + lock = pud_lock(mm, pudp); + if (!unlikely(pud_leaf(pud))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + pud_pfn(pud), PUD_MASK, pud_write(pud), + pud_special(pud)); + return 0; + } - pmd = pmd_offset(pud, address); - VM_BUG_ON(pmd_trans_huge(*pmd)); + pmdp = pmd_offset(pudp, address); + pmd = pmdp_get_lockless(pmdp); + if (pmd_leaf(pmd)) { + lock = pmd_lock(mm, pmdp); + if (!unlikely(pmd_leaf(pmd))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + pmd_special(pmd)); + return 0; + } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; - if (!pte_present(ptep_get(ptep))) + pte = ptep_get(ptep); + if (!pte_present(pte)) goto unlock; - *ptepp = ptep; + pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + pte_pfn(pte), PAGE_MASK, pte_write(pte), + pte_special(pte)); return 0; unlock: - pte_unmap_unlock(ptep, *ptlp); + pte_unmap_unlock(ptep, lock); out: return -EINVAL; } -EXPORT_SYMBOL_GPL(follow_pte); +EXPORT_SYMBOL_GPL(follow_pfnmap_start); + +/** + * follow_pfnmap_end(): End a follow_pfnmap_start() process + * @args: Pointer to struct @follow_pfnmap_args + * + * Must be used in pair of follow_pfnmap_start(). See the start() function + * above for more information. + */ +void follow_pfnmap_end(struct follow_pfnmap_args *args) +{ + if (args->lock) + spin_unlock(args->lock); + if (args->ptep) + pte_unmap(args->ptep); +} +EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** @@ -6200,34 +6498,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - pte_t *ptep, pte; - spinlock_t *ptl; int offset = offset_in_page(addr); int ret = -EINVAL; + bool writable; + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - pte_unmap_unlock(ptep, ptl); - - prot = pgprot_val(pte_pgprot(pte)); - phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + prot = pgprot_val(args.pgprot); + phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; + writable = args.writable; + follow_pfnmap_end(&args); - if ((write & FOLL_WRITE) && !pte_write(pte)) + if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) goto out_unmap; - if (!pte_same(pte, ptep_get(ptep))) { - pte_unmap_unlock(ptep, ptl); + if ((prot != pgprot_val(args.pgprot)) || + (phys_addr != (args.pfn << PAGE_SHIFT)) || + (writable != args.writable)) { + follow_pfnmap_end(&args); iounmap(maddr); - goto retry; } @@ -6236,7 +6534,7 @@ retry: else memcpy_fromio(buf, maddr + offset, len); ret = len; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unmap: iounmap(maddr); @@ -6587,7 +6885,7 @@ long copy_folio_from_user(struct folio *dst_folio, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 951878ab627a..621ae1015106 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -366,7 +366,7 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, +int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { const unsigned long end_pfn = pfn + nr_pages; @@ -524,7 +524,7 @@ static void update_pgdat_span(struct pglist_data *pgdat) pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } -void __ref remove_pfn_range_from_zone(struct zone *zone, +void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -629,7 +629,7 @@ int restore_online_page_callback(online_page_callback_t callback) EXPORT_SYMBOL_GPL(restore_online_page_callback); /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -void __ref generic_online_page(struct page *page, unsigned int order) +void generic_online_page(struct page *page, unsigned int order) { __free_pages_core(page, order, MEMINIT_HOTPLUG); } @@ -741,7 +741,7 @@ static inline void section_taint_zone_device(unsigned long pfn) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. */ -void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, +void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype) { @@ -1143,7 +1143,7 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, +int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { unsigned long flags; @@ -1233,7 +1233,7 @@ failed_addition: } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_init_pgdat(int nid) +static pg_data_t *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; @@ -1386,7 +1386,7 @@ bool mhp_supports_memmap_on_memory(void) } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); -static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) +static void remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; @@ -1473,7 +1473,7 @@ out: * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) +int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; @@ -1580,7 +1580,7 @@ error_mem_hotplug_end: } /* requires device_hotplug_lock, see add_memory_resource() */ -int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) +int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; @@ -1772,67 +1772,59 @@ found: static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { + struct folio *folio; unsigned long pfn; - struct page *page, *head; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); for (pfn = start_pfn; pfn < end_pfn; pfn++) { - struct folio *folio; - bool isolated; + struct page *page; if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); folio = page_folio(page); - head = &folio->page; - if (PageHuge(page)) { - pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(folio, &source); - continue; - } else if (PageTransHuge(page)) - pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; + /* + * No reference or lock is held on the folio, so it might + * be modified concurrently (e.g. split). As such, + * folio_nr_pages() may read garbage. This is fine as the outer + * loop will revisit the split folio later. + */ + if (folio_test_large(folio)) + pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; /* * HWPoison pages have elevated reference counts so the migration would * fail on them. It also doesn't make any sense to migrate them in the * first place. Still try to unmap such a page in case it is still mapped - * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep - * the unmap as the catch all safety net). + * (keep the unmap as the catch all safety net). */ - if (PageHWPoison(page)) { + if (folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); if (folio_mapped(folio)) - try_to_unmap(folio, TTU_IGNORE_MLOCK); + unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK); continue; } - if (!get_page_unless_zero(page)) + if (!folio_try_get(folio)) continue; - /* - * We can skip free pages. And we can deal with pages on - * LRU and non-lru movable pages. - */ - if (PageLRU(page)) - isolated = isolate_lru_page(page); - else - isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); - if (isolated) { - list_add_tail(&page->lru, &source); - if (!__PageMovable(page)) - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_lru(page)); - } else { + if (unlikely(page_folio(page) != folio)) + goto put_folio; + + if (!isolate_folio_to_list(folio, &source)) { if (__ratelimit(&migrate_rs)) { - pr_warn("failed to isolate pfn %lx\n", pfn); + pr_warn("failed to isolate pfn %lx\n", + page_to_pfn(page)); dump_page(page, "isolation failed"); } } - put_page(page); +put_folio: + folio_put(folio); } if (!list_empty(&source)) { nodemask_t nmask = node_states[N_MEMORY]; @@ -1847,7 +1839,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We have checked that migration range is on a single zone so * we can use the nid of the first page to all the others. */ - mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru)); /* * try to allocate from a different node but reuse this node @@ -1860,11 +1852,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { - list_for_each_entry(page, &source, lru) { + list_for_each_entry(folio, &source, lru) { if (__ratelimit(&migrate_rs)) { pr_warn("migrating pfn %lx failed ret:%d\n", - page_to_pfn(page), ret); - dump_page(page, "migration failure"); + folio_pfn(folio), ret); + dump_page(&folio->page, + "migration failure"); } } putback_movable_pages(&source); @@ -1939,7 +1932,7 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, /* * Must be called with mem_hotplug_lock in write mode. */ -int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, +int offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; @@ -2240,7 +2233,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size) return 1; } -static int __ref try_remove_memory(u64 start, u64 size) +static int try_remove_memory(u64 start, u64 size) { int rc, nid = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b858e22b259d..b646fab3e45e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -676,8 +676,10 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); - if (nr_updated > 0) + if (nr_updated > 0) { count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated); + } tlb_finish_mmu(&tlb); @@ -1951,7 +1953,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); - return z->zone ? zone_to_nid(z->zone) : node; + return zonelist_zone(z) ? zonelist_node_idx(z) : node; } case MPOL_LOCAL: return node; @@ -2809,7 +2811,7 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, node_zonelist(thisnid, GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); - polnid = zone_to_nid(z->zone); + polnid = zonelist_node_idx(z); break; default: diff --git a/mm/migrate.c b/mm/migrate.c index 923ea80ba744..dfdb3a136bf8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -20,7 +20,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/mm_inline.h> -#include <linux/nsproxy.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> @@ -35,21 +34,16 @@ #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/hugetlb.h> -#include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> #include <linux/pfn_t.h> -#include <linux/memremap.h> -#include <linux/userfaultfd_k.h> -#include <linux/balloon_compaction.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> -#include <linux/oom.h> #include <linux/memory.h> -#include <linux/random.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> +#include <linux/pagewalk.h> #include <asm/tlbflush.h> @@ -177,13 +171,83 @@ void putback_movable_pages(struct list_head *l) } } +/* Must be called with an elevated refcount on the non-hugetlb folio */ +bool isolate_folio_to_list(struct folio *folio, struct list_head *list) +{ + bool isolated, lru; + + if (folio_test_hugetlb(folio)) + return isolate_hugetlb(folio, list); + + lru = !__folio_test_movable(folio); + if (lru) + isolated = folio_isolate_lru(folio); + else + isolated = isolate_movable_page(&folio->page, + ISOLATE_UNEVICTABLE); + + if (!isolated) + return false; + + list_add(&folio->lru, list); + if (lru) + node_stat_add_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio)); + + return true; +} + +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, + struct folio *folio, + unsigned long idx) +{ + struct page *page = folio_page(folio, idx); + bool contains_data; + pte_t newpte; + void *addr; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); + + if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || + mm_forbids_zeropage(pvmw->vma->vm_mm)) + return false; + + /* + * The pmd entry mapping the old thp was flushed and the pte mapping + * this subpage has been non present. If the subpage is only zero-filled + * then map it to the shared zeropage. + */ + addr = kmap_local_page(page); + contains_data = memchr_inv(addr, 0, PAGE_SIZE); + kunmap_local(addr); + + if (contains_data) + return false; + + newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), + pvmw->vma->vm_page_prot)); + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); + + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); + return true; +} + +struct rmap_walk_arg { + struct folio *folio; + bool map_unused_to_zeropage; +}; + /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, - struct vm_area_struct *vma, unsigned long addr, void *old) + struct vm_area_struct *vma, unsigned long addr, void *arg) { - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + struct rmap_walk_arg *rmap_walk_arg = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; @@ -207,6 +271,9 @@ static bool remove_migration_pte(struct folio *folio, continue; } #endif + if (rmap_walk_arg->map_unused_to_zeropage && + try_to_map_unused_to_zeropage(&pvmw, folio, idx)) + continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); @@ -285,14 +352,21 @@ static bool remove_migration_pte(struct folio *folio, * Get rid of all migration entries and replace them by * references to the indicated page. */ -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) { + struct rmap_walk_arg rmap_walk_arg = { + .folio = src, + .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, + }; + struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, - .arg = src, + .arg = &rmap_walk_arg, }; - if (locked) + VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); + + if (flags & RMP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); @@ -422,6 +496,8 @@ static int __folio_migrate_mapping(struct address_space *mapping, /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); @@ -446,6 +522,8 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; + if (folio_test_anon(folio) && folio_test_large(folio)) + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio)) { __folio_set_swapbacked(newfolio); @@ -585,8 +663,6 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; - if (folio_test_error(folio)) - folio_set_error(newfolio); if (folio_test_referenced(folio)) folio_set_referenced(newfolio); if (folio_test_uptodate(folio)) @@ -640,7 +716,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's - * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache(). + * ksm_get_folio() depends upon ksm_migrate_page() and the + * swapcache flag. */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); @@ -666,6 +743,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); + pgalloc_tag_copy(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } @@ -904,7 +982,7 @@ static int writeout(struct address_space *mapping, struct folio *folio) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); rc = mapping->a_ops->writepage(&folio->page, &wbc); @@ -1068,7 +1146,7 @@ static void migrate_folio_undo_src(struct folio *src, struct list_head *ret) { if (page_was_mapped) - remove_migration_ptes(src, src, false); + remove_migration_ptes(src, src, 0); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); @@ -1306,7 +1384,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, lru_add_drain(); if (old_page_state & PAGE_WAS_MAPPED) - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); out_unlock_both: folio_unlock(dst); @@ -1444,7 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (page_was_mapped) remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, false); + rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1682,7 +1760,8 @@ static int migrate_pages_batch(struct list_head *from, * use _deferred_list. */ if (nr_pages > 2 && - !list_empty(&folio->_deferred_list)) { + !list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) { if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; @@ -2111,76 +2190,66 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node) return err; } +static int __add_folio_for_migration(struct folio *folio, int node, + struct list_head *pagelist, bool migrate_all) +{ + if (is_zero_folio(folio) || is_huge_zero_folio(folio)) + return -EFAULT; + + if (folio_is_zone_device(folio)) + return -ENOENT; + + if (folio_nid(folio) == node) + return 0; + + if (folio_likely_mapped_shared(folio) && !migrate_all) + return -EACCES; + + if (folio_test_hugetlb(folio)) { + if (isolate_hugetlb(folio, pagelist)) + return 1; + } else if (folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, pagelist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + return 1; + } + return -EBUSY; +} + /* - * Resolves the given address to a struct page, isolates it from the LRU and + * Resolves the given address to a struct folio, isolates it from the LRU and * puts it to the given pagelist. * Returns: - * errno - if the page cannot be found/isolated + * errno - if the folio cannot be found/isolated * 0 - when it doesn't have to be migrated because it is already on the * target node * 1 - when it has been queued */ -static int add_page_for_migration(struct mm_struct *mm, const void __user *p, +static int add_folio_for_migration(struct mm_struct *mm, const void __user *p, int node, struct list_head *pagelist, bool migrate_all) { struct vm_area_struct *vma; - unsigned long addr; - struct page *page; + struct folio_walk fw; struct folio *folio; - int err; + unsigned long addr; + int err = -EFAULT; mmap_read_lock(mm); addr = (unsigned long)untagged_addr_remote(mm, p); - err = -EFAULT; vma = vma_lookup(mm, addr); - if (!vma || !vma_migratable(vma)) - goto out; - - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - - err = -ENOENT; - if (!page) - goto out; - - folio = page_folio(page); - if (folio_is_zone_device(folio)) - goto out_putfolio; - - err = 0; - if (folio_nid(folio) == node) - goto out_putfolio; - - err = -EACCES; - if (folio_likely_mapped_shared(folio) && !migrate_all) - goto out_putfolio; - - err = -EBUSY; - if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) - err = 1; - } else { - if (!folio_isolate_lru(folio)) - goto out_putfolio; - - err = 1; - list_add_tail(&folio->lru, pagelist); - node_stat_mod_folio(folio, - NR_ISOLATED_ANON + folio_is_file_lru(folio), - folio_nr_pages(folio)); + if (vma && vma_migratable(vma)) { + folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); + if (folio) { + err = __add_folio_for_migration(folio, node, pagelist, + migrate_all); + folio_walk_end(&fw, vma); + } else { + err = -ENOENT; + } } -out_putfolio: - /* - * Either remove the duplicate refcount from folio_isolate_lru() - * or drop the folio ref if it was not isolated. - */ - folio_put(folio); -out: mmap_read_unlock(mm); return err; } @@ -2274,8 +2343,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ - err = add_page_for_migration(mm, p, current_node, &pagelist, - flags & MPOL_MF_MOVE_ALL); + err = add_folio_for_migration(mm, p, current_node, &pagelist, + flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ @@ -2331,28 +2400,26 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; - struct page *page; + struct folio_walk fw; + struct folio *folio; int err = -EFAULT; vma = vma_lookup(mm, addr); if (!vma) goto set_status; - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; - - err = -ENOENT; - if (!page) - goto set_status; - - if (!is_zone_device_page(page)) - err = page_to_nid(page); - - put_page(page); + folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); + if (folio) { + if (is_zero_folio(folio) || is_huge_zero_folio(folio)) + err = -EFAULT; + else if (folio_is_zone_device(folio)) + err = -ENOENT; + else + err = folio_nid(folio); + folio_walk_end(&fw, vma); + } else { + err = -ENOENT; + } set_status: *status = err; @@ -2432,25 +2499,19 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) return current->mm; } - /* Find the mm_struct */ - rcu_read_lock(); - task = find_task_by_vpid(pid); + task = find_get_task_by_vpid(pid); if (!task) { - rcu_read_unlock(); return ERR_PTR(-ESRCH); } - get_task_struct(task); /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { - rcu_read_unlock(); mm = ERR_PTR(-EPERM); goto out; } - rcu_read_unlock(); mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) @@ -2526,7 +2587,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - ZONE_MOVABLE, 0)) + ZONE_MOVABLE, ALLOC_CMA)) continue; return true; } @@ -2627,6 +2688,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); + struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, @@ -2636,10 +2699,13 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, putback_movable_pages(&migratepages); if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); - if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) - mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, - nr_succeeded); + count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded); + if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + && !node_is_toptier(folio_nid(folio)) + && node_is_toptier(node)) + mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded); } + mem_cgroup_put(memcg); BUG_ON(!list_empty(&migratepages)); return nr_remaining ? -EAGAIN : 0; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 6d66dc1c6ffa..9cf26592ac93 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -328,8 +328,8 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) /* * One extra ref because caller holds an extra reference, either from - * isolate_lru_page() for a regular page, or migrate_vma_collect() for - * a device page. + * folio_isolate_lru() for a regular folio, or migrate_vma_collect() for + * a device folio. */ int extra = 1 + (page == fault_page); @@ -379,33 +379,33 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; } - /* ZONE_DEVICE pages are not on LRU */ - if (!is_zone_device_page(page)) { - if (!PageLRU(page) && allow_drain) { + folio = page_folio(page); + /* ZONE_DEVICE folios are not on LRU */ + if (!folio_is_zone_device(folio)) { + if (!folio_test_lru(folio) && allow_drain) { /* Drain CPU's lru cache */ lru_add_drain_all(); allow_drain = false; } - if (!isolate_lru_page(page)) { + if (!folio_isolate_lru(folio)) { src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } /* Drop the reference we took in collect */ - put_page(page); + folio_put(folio); } - folio = page_folio(page); if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || + if (folio_mapped(folio) || !migrate_vma_check_page(page, fault_page)) { - if (!is_zone_device_page(page)) { - get_page(page); - putback_lru_page(page); + if (!folio_is_zone_device(folio)) { + folio_get(folio); + folio_putback_lru(folio); } src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; @@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, continue; folio = page_folio(page); - remove_migration_ptes(folio, folio, false); + remove_migration_ptes(folio, folio, 0); src_pfns[i] = 0; folio_unlock(folio); @@ -708,7 +708,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, /* * The only time there is no vma is when called from - * migrate_device_coherent_page(). However this isn't + * migrate_device_coherent_folio(). However this isn't * called if the page could not be unmapped. */ VM_BUG_ON(!migrate); @@ -815,42 +815,45 @@ void migrate_device_finalize(unsigned long *src_pfns, unsigned long i; for (i = 0; i < npages; i++) { - struct folio *dst, *src; + struct folio *dst = NULL, *src = NULL; struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); struct page *page = migrate_pfn_to_page(src_pfns[i]); + if (newpage) + dst = page_folio(newpage); + if (!page) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); + if (dst) { + folio_unlock(dst); + folio_put(dst); } continue; } - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { - if (newpage) { - unlock_page(newpage); - put_page(newpage); + src = page_folio(page); + + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) { + if (dst) { + folio_unlock(dst); + folio_put(dst); } - newpage = page; + dst = src; } - src = page_folio(page); - dst = page_folio(newpage); - remove_migration_ptes(src, dst, false); + remove_migration_ptes(src, dst, 0); folio_unlock(src); - if (is_zone_device_page(page)) - put_page(page); + if (folio_is_zone_device(src)) + folio_put(src); else - putback_lru_page(page); + folio_putback_lru(src); - if (newpage != page) { - unlock_page(newpage); - if (is_zone_device_page(newpage)) - put_page(newpage); + if (dst != src) { + folio_unlock(dst); + if (folio_is_zone_device(dst)) + folio_put(dst); else - putback_lru_page(newpage); + folio_putback_lru(dst); } } } @@ -898,16 +901,17 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, unsigned long i, pfn; for (pfn = start, i = 0; i < npages; pfn++, i++) { - struct page *page = pfn_to_page(pfn); + struct folio *folio; - if (!get_page_unless_zero(page)) { + folio = folio_get_nontail_page(pfn_to_page(pfn)); + if (!folio) { src_pfns[i] = 0; continue; } - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { src_pfns[i] = 0; - put_page(page); + folio_put(folio); continue; } @@ -921,38 +925,38 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, EXPORT_SYMBOL(migrate_device_range); /* - * Migrate a device coherent page back to normal memory. The caller should have - * a reference on page which will be copied to the new page if migration is + * Migrate a device coherent folio back to normal memory. The caller should have + * a reference on folio which will be copied to the new folio if migration is * successful or dropped on failure. */ -int migrate_device_coherent_page(struct page *page) +int migrate_device_coherent_folio(struct folio *folio) { unsigned long src_pfn, dst_pfn = 0; - struct page *dpage; + struct folio *dfolio; - WARN_ON_ONCE(PageCompound(page)); + WARN_ON_ONCE(folio_test_large(folio)); - lock_page(page); - src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + folio_lock(folio); + src_pfn = migrate_pfn(folio_pfn(folio)) | MIGRATE_PFN_MIGRATE; /* * We don't have a VMA and don't need to walk the page tables to find - * the source page. So call migrate_vma_unmap() directly to unmap the - * page as migrate_vma_setup() will fail if args.vma == NULL. + * the source folio. So call migrate_vma_unmap() directly to unmap the + * folio as migrate_vma_setup() will fail if args.vma == NULL. */ migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; - dpage = alloc_page(GFP_USER | __GFP_NOWARN); - if (dpage) { - lock_page(dpage); - dst_pfn = migrate_pfn(page_to_pfn(dpage)); + dfolio = folio_alloc(GFP_USER | __GFP_NOWARN, 0); + if (dfolio) { + folio_lock(dfolio); + dst_pfn = migrate_pfn(folio_pfn(dfolio)); } migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) - copy_highpage(dpage, page); + folio_copy(dfolio, folio); migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) diff --git a/mm/mm_init.c b/mm/mm_init.c index 51960079875b..4ba5607aaf19 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1835,14 +1835,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) for_each_node(nid) { pg_data_t *pgdat; - if (!node_online(nid)) { - /* Allocator not initialized yet */ - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - arch_refresh_nodedata(nid, pgdat); - } + if (!node_online(nid)) + alloc_offline_node_data(nid); pgdat = NODE_DATA(nid); free_area_init_node(nid); @@ -1939,7 +1933,7 @@ static void __init deferred_free_pages(unsigned long pfn, } /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ - accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) diff --git a/mm/mmap.c b/mm/mmap.c index 6ddb278a5ee8..ee8f91eaadb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -76,16 +76,6 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); - -static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) -{ - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); -} - /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { @@ -102,100 +92,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) } /* - * Requires inode->i_mapping->i_mmap_rwsem - */ -static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_unmap_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -/* - * Unlink a file-based vm structure from its interval tree, to hide - * vma from rmap and vmtruncate before freeing its page tables. - */ -void unlink_file_vma(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - if (file) { - struct address_space *mapping = file->f_mapping; - i_mmap_lock_write(mapping); - __remove_shared_vm_struct(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) -{ - vb->count = 0; -} - -static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) -{ - struct address_space *mapping; - int i; - - mapping = vb->vmas[0]->vm_file->f_mapping; - i_mmap_lock_write(mapping); - for (i = 0; i < vb->count; i++) { - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); - __remove_shared_vm_struct(vb->vmas[i], mapping); - } - i_mmap_unlock_write(mapping); - - unlink_file_vma_batch_init(vb); -} - -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, - struct vm_area_struct *vma) -{ - if (vma->vm_file == NULL) - return; - - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || - vb->count == ARRAY_SIZE(vb->vmas)) - unlink_file_vma_batch_process(vb); - - vb->vmas[vb->count] = vma; - vb->count++; -} - -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) -{ - if (vb->count > 0) - unlink_file_vma_batch_process(vb); -} - -/* - * Close a vm structure and free it. - */ -static void remove_vma(struct vm_area_struct *vma, bool unreachable) -{ - might_sleep(); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); - if (unreachable) - __vm_area_free(vma); - else - vm_area_free(vma); -} - -static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, - unsigned long min) -{ - return mas_prev(&vmi->mas, min); -} - -/* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check @@ -273,11 +169,12 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_vma_munmap() will drop the lock on success, so update it - * before calling do_vma_munmap(). + * do_vmi_align_munmap() will drop the lock on success, so + * update it before calling do_vma_munmap(). */ mm->brk = brk; - if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true)) + if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf, + /* unlock = */ true)) goto out; goto success_unlocked; @@ -318,875 +215,6 @@ out: return origbrk; } -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) -static void validate_mm(struct mm_struct *mm) -{ - int bug = 0; - int i = 0; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mt_validate(&mm->mm_mt); - for_each_vma(vmi, vma) { -#ifdef CONFIG_DEBUG_VM_RB - struct anon_vma *anon_vma = vma->anon_vma; - struct anon_vma_chain *avc; -#endif - unsigned long vmi_start, vmi_end; - bool warn = 0; - - vmi_start = vma_iter_addr(&vmi); - vmi_end = vma_iter_end(&vmi); - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) - warn = 1; - - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) - warn = 1; - - if (warn) { - pr_emerg("issue in %s\n", current->comm); - dump_stack(); - dump_vma(vma); - pr_emerg("tree range: %px start %lx end %lx\n", vma, - vmi_start, vmi_end - 1); - vma_iter_dump_tree(&vmi); - } - -#ifdef CONFIG_DEBUG_VM_RB - if (anon_vma) { - anon_vma_lock_read(anon_vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - anon_vma_unlock_read(anon_vma); - } -#endif - i++; - } - if (i != mm->map_count) { - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); - bug = 1; - } - VM_BUG_ON_MM(bug, mm); -} - -#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ -#define validate_mm(mm) do { } while (0) -#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ - -/* - * vma has some anon_vma assigned, and is already inserted on that - * anon_vma's interval trees. - * - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the - * vma must be removed from the anon_vma's interval trees using - * anon_vma_interval_tree_pre_update_vma(). - * - * After the update, the vma will be reinserted using - * anon_vma_interval_tree_post_update_vma(). - * - * The entire update must be protected by exclusive mmap_lock and by - * the root anon_vma's mutex. - */ -static inline void -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); -} - -static inline void -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) -{ - struct anon_vma_chain *avc; - - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); -} - -static unsigned long count_vma_pages_range(struct mm_struct *mm, - unsigned long addr, unsigned long end) -{ - VMA_ITERATOR(vmi, mm, addr); - struct vm_area_struct *vma; - unsigned long nr_pages = 0; - - for_each_vma_range(vmi, vma, end) { - unsigned long vm_start = max(addr, vma->vm_start); - unsigned long vm_end = min(end, vma->vm_end); - - nr_pages += PHYS_PFN(vm_end - vm_start); - } - - return nr_pages; -} - -static void __vma_link_file(struct vm_area_struct *vma, - struct address_space *mapping) -{ - if (vma_is_shared_maywrite(vma)) - mapping_allow_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -static void vma_link_file(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct address_space *mapping; - - if (file) { - mapping = file->f_mapping; - i_mmap_lock_write(mapping); - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); - } -} - -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) -{ - VMA_ITERATOR(vmi, mm, 0); - - vma_iter_config(&vmi, vma->vm_start, vma->vm_end); - if (vma_iter_prealloc(&vmi, vma)) - return -ENOMEM; - - vma_start_write(vma); - vma_iter_store(&vmi, vma); - vma_link_file(vma); - mm->map_count++; - validate_mm(mm); - return 0; -} - -/* - * init_multi_vma_prep() - Initializer for struct vma_prepare - * @vp: The vma_prepare struct - * @vma: The vma that will be altered once locked - * @next: The next vma if it is to be adjusted - * @remove: The first vma to be removed - * @remove2: The second vma to be removed - */ -static inline void init_multi_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma, struct vm_area_struct *next, - struct vm_area_struct *remove, struct vm_area_struct *remove2) -{ - memset(vp, 0, sizeof(struct vma_prepare)); - vp->vma = vma; - vp->anon_vma = vma->anon_vma; - vp->remove = remove; - vp->remove2 = remove2; - vp->adj_next = next; - if (!vp->anon_vma && next) - vp->anon_vma = next->anon_vma; - - vp->file = vma->vm_file; - if (vp->file) - vp->mapping = vma->vm_file->f_mapping; - -} - -/* - * init_vma_prep() - Initializer wrapper for vma_prepare struct - * @vp: The vma_prepare struct - * @vma: The vma that will be altered once locked - */ -static inline void init_vma_prep(struct vma_prepare *vp, - struct vm_area_struct *vma) -{ - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); -} - - -/* - * vma_prepare() - Helper function for handling locking VMAs prior to altering - * @vp: The initialized vma_prepare struct - */ -static inline void vma_prepare(struct vma_prepare *vp) -{ - if (vp->file) { - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); - - if (vp->adj_next) - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, - vp->adj_next->vm_end); - - i_mmap_lock_write(vp->mapping); - if (vp->insert && vp->insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(vp->insert, - vp->insert->vm_file->f_mapping); - } - } - - if (vp->anon_vma) { - anon_vma_lock_write(vp->anon_vma); - anon_vma_interval_tree_pre_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_pre_update_vma(vp->adj_next); - } - - if (vp->file) { - flush_dcache_mmap_lock(vp->mapping); - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); - if (vp->adj_next) - vma_interval_tree_remove(vp->adj_next, - &vp->mapping->i_mmap); - } - -} - -/* - * vma_complete- Helper function for handling the unlocking after altering VMAs, - * or for inserting a VMA. - * - * @vp: The vma_prepare struct - * @vmi: The vma iterator - * @mm: The mm_struct - */ -static inline void vma_complete(struct vma_prepare *vp, - struct vma_iterator *vmi, struct mm_struct *mm) -{ - if (vp->file) { - if (vp->adj_next) - vma_interval_tree_insert(vp->adj_next, - &vp->mapping->i_mmap); - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); - flush_dcache_mmap_unlock(vp->mapping); - } - - if (vp->remove && vp->file) { - __remove_shared_vm_struct(vp->remove, vp->mapping); - if (vp->remove2) - __remove_shared_vm_struct(vp->remove2, vp->mapping); - } else if (vp->insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - vma_iter_store(vmi, vp->insert); - mm->map_count++; - } - - if (vp->anon_vma) { - anon_vma_interval_tree_post_update_vma(vp->vma); - if (vp->adj_next) - anon_vma_interval_tree_post_update_vma(vp->adj_next); - anon_vma_unlock_write(vp->anon_vma); - } - - if (vp->file) { - i_mmap_unlock_write(vp->mapping); - uprobe_mmap(vp->vma); - - if (vp->adj_next) - uprobe_mmap(vp->adj_next); - } - - if (vp->remove) { -again: - vma_mark_detached(vp->remove, true); - if (vp->file) { - uprobe_munmap(vp->remove, vp->remove->vm_start, - vp->remove->vm_end); - fput(vp->file); - } - if (vp->remove->anon_vma) - anon_vma_merge(vp->vma, vp->remove); - mm->map_count--; - mpol_put(vma_policy(vp->remove)); - if (!vp->remove2) - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); - vm_area_free(vp->remove); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we are removing both mid and next vmas - */ - if (vp->remove2) { - vp->remove = vp->remove2; - vp->remove2 = NULL; - goto again; - } - } - if (vp->insert && vp->file) - uprobe_mmap(vp->insert); - validate_mm(mm); -} - -/* - * dup_anon_vma() - Helper function to duplicate anon_vma - * @dst: The destination VMA - * @src: The source VMA - * @dup: Pointer to the destination VMA when successful. - * - * Returns: 0 on success. - */ -static inline int dup_anon_vma(struct vm_area_struct *dst, - struct vm_area_struct *src, struct vm_area_struct **dup) -{ - /* - * Easily overlooked: when mprotect shifts the boundary, make sure the - * expanding vma has anon_vma set if the shrinking vma had, to cover any - * anon pages imported. - */ - if (src->anon_vma && !dst->anon_vma) { - int ret; - - vma_assert_write_locked(dst); - dst->anon_vma = src->anon_vma; - ret = anon_vma_clone(dst, src); - if (ret) - return ret; - - *dup = dst; - } - - return 0; -} - -/* - * vma_expand - Expand an existing VMA - * - * @vmi: The vma iterator - * @vma: The vma to expand - * @start: The start of the vma - * @end: The exclusive end of the vma - * @pgoff: The page offset of vma - * @next: The current of next vma. - * - * Expand @vma to @start and @end. Can expand off the start and end. Will - * expand over @next if it's different from @vma and @end == @next->vm_end. - * Checking if the @vma can expand and merge with @next needs to be handled by - * the caller. - * - * Returns: 0 on success - */ -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) -{ - struct vm_area_struct *anon_dup = NULL; - bool remove_next = false; - struct vma_prepare vp; - - vma_start_write(vma); - if (next && (vma != next) && (end == next->vm_end)) { - int ret; - - remove_next = true; - vma_start_write(next); - ret = dup_anon_vma(vma, next, &anon_dup); - if (ret) - return ret; - } - - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); - /* Not merging but overwriting any part of next is not handled. */ - VM_WARN_ON(next && !vp.remove && - next != vma && end > next->vm_start); - /* Only handles expanding */ - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); - - /* Note: vma iterator must be pointing to 'start' */ - vma_iter_config(vmi, start, end); - if (vma_iter_prealloc(vmi, vma)) - goto nomem; - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - vma_set_range(vma, start, end, pgoff); - vma_iter_store(vmi, vma); - - vma_complete(&vp, vmi, vma->vm_mm); - return 0; - -nomem: - if (anon_dup) - unlink_anon_vmas(anon_dup); - return -ENOMEM; -} - -/* - * vma_shrink() - Reduce an existing VMAs memory area - * @vmi: The vma iterator - * @vma: The VMA to modify - * @start: The new start - * @end: The new end - * - * Returns: 0 on success, -ENOMEM otherwise - */ -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff) -{ - struct vma_prepare vp; - - WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); - - if (vma->vm_start < start) - vma_iter_config(vmi, vma->vm_start, start); - else - vma_iter_config(vmi, end, vma->vm_end); - - if (vma_iter_prealloc(vmi, NULL)) - return -ENOMEM; - - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); - - vma_iter_clear(vmi); - vma_set_range(vma, start, end, pgoff); - vma_complete(&vp, vmi, vma->vm_mm); - return 0; -} - -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those if the caller indicates - * the current vma may be removed as part of the merge. - */ -static inline bool is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name, bool may_remove_vma) -{ - /* - * VM_SOFTDIRTY should not prevent from VMA merging, if we - * match the flags but dirty bit -- the caller should mark - * merged VMA as dirty. If dirty bit won't be excluded from - * comparison, we increase pressure on the memory system forcing - * the kernel to generate new VMAs when old one could be - * extended instead. - */ - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) - return false; - if (vma->vm_file != file) - return false; - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) - return false; - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) - return false; - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) - return false; - return true; -} - -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, struct vm_area_struct *vma) -{ - /* - * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. - */ - if ((!anon_vma1 || !anon_vma2) && (!vma || - list_is_singular(&vma->anon_vma_chain))) - return true; - return anon_vma1 == anon_vma2; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * in front of (at a lower virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap() does not permit mmap's which - * wrap, nor mmaps which cover the final page at index -1UL. - * - * We assume the vma may be removed as part of the merge. - */ -static bool -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - if (vma->vm_pgoff == vm_pgoff) - return true; - } - return false; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * beyond (at a higher virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We assume that vma is not removed as part of the merge. - */ -static bool -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { - pgoff_t vm_pglen; - vm_pglen = vma_pages(vma); - if (vma->vm_pgoff + vm_pglen == vm_pgoff) - return true; - } - return false; -} - -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), - * figure out whether that can be merged with its predecessor or its - * successor. Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. - * - * The following mprotect cases have to be considered, where **** is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts - * at the same address as **** and is of the same or larger span, and - * NNNN the next vma after ****: - * - * **** **** **** - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC - * cannot merge might become might become - * PPNNNNNNNNNN PPPPPPPPPPCC - * mmap, brk or case 4 below case 5 below - * mremap move: - * **** **** - * PPPP NNNN PPPPCCCCNNNN - * might become might become - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 - * - * It is important for case 8 that the vma CCCC overlapping the - * region **** is never going to extended over NNNN. Instead NNNN must - * be extended in region **** and CCCC must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_merge drops the - * rmap_locks, the properties of the merged vma will be already - * correct for the whole merged range. Some of those properties like - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must - * be correct for the whole merged range immediately after the - * rmap_locks are released. Otherwise if NNNN would be removed and - * CCCC would be extended over the NNNN range, remove_migration_ptes - * or other rmap walkers (if working on addresses beyond the "end" - * parameter) may establish ptes with the wrong permissions of CCCC - * instead of the right permissions of NNNN. - * - * In the code below: - * PPPP is represented by *prev - * CCCC is represented by *curr or not represented at all (NULL) - * NNNN is represented by *next or not represented at all (NULL) - * **** is not represented - it will be merged and the vma containing the - * area is returned, or the function will return NULL - */ -static struct vm_area_struct -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *src, unsigned long addr, unsigned long end, - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) -{ - struct mm_struct *mm = src->vm_mm; - struct anon_vma *anon_vma = src->anon_vma; - struct file *file = src->vm_file; - struct vm_area_struct *curr, *next, *res; - struct vm_area_struct *vma, *adjust, *remove, *remove2; - struct vm_area_struct *anon_dup = NULL; - struct vma_prepare vp; - pgoff_t vma_pgoff; - int err = 0; - bool merge_prev = false; - bool merge_next = false; - bool vma_expanded = false; - unsigned long vma_start = addr; - unsigned long vma_end = end; - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - long adj_start = 0; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vm_flags & VM_SPECIAL) - return NULL; - - /* Does the input range span an existing VMA? (cases 5 - 8) */ - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); - - if (!curr || /* cases 1 - 4 */ - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ - next = vma_lookup(mm, end); - else - next = NULL; /* case 5 */ - - if (prev) { - vma_start = prev->vm_start; - vma_pgoff = prev->vm_pgoff; - - /* Can we merge the predecessor? */ - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) - && can_vma_merge_after(prev, vm_flags, anon_vma, file, - pgoff, vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; - vma_prev(vmi); - } - } - - /* Can we merge the successor? */ - if (next && mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx, anon_name)) { - merge_next = true; - } - - /* Verify some invariant that must be enforced by the caller. */ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - - if (!merge_prev && !merge_next) - return NULL; /* Not mergeable. */ - - if (merge_prev) - vma_start_write(prev); - - res = vma = prev; - remove = remove2 = adjust = NULL; - - /* Can we merge both the predecessor and the successor? */ - if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { - vma_start_write(next); - remove = next; /* case 1 */ - vma_end = next->vm_end; - err = dup_anon_vma(prev, next, &anon_dup); - if (curr) { /* case 6 */ - vma_start_write(curr); - remove = curr; - remove2 = next; - /* - * Note that the dup_anon_vma below cannot overwrite err - * since the first caller would do nothing unless next - * has an anon_vma. - */ - if (!next->anon_vma) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else if (merge_prev) { /* case 2 */ - if (curr) { - vma_start_write(curr); - if (end == curr->vm_end) { /* case 7 */ - /* - * can_vma_merge_after() assumed we would not be - * removing prev vma, so it skipped the check - * for vm_ops->close, but we are removing curr - */ - if (curr->vm_ops && curr->vm_ops->close) - err = -EINVAL; - remove = curr; - } else { /* case 5 */ - adjust = curr; - adj_start = (end - curr->vm_start); - } - if (!err) - err = dup_anon_vma(prev, curr, &anon_dup); - } - } else { /* merge_next */ - vma_start_write(next); - res = next; - if (prev && addr < prev->vm_end) { /* case 4 */ - vma_start_write(prev); - vma_end = addr; - adjust = next; - adj_start = -(prev->vm_end - addr); - err = dup_anon_vma(next, prev, &anon_dup); - } else { - /* - * Note that cases 3 and 8 are the ONLY ones where prev - * is permitted to be (but is not necessarily) NULL. - */ - vma = next; /* case 3 */ - vma_start = addr; - vma_end = next->vm_end; - vma_pgoff = next->vm_pgoff - pglen; - if (curr) { /* case 8 */ - vma_pgoff = curr->vm_pgoff; - vma_start_write(curr); - remove = curr; - err = dup_anon_vma(next, curr, &anon_dup); - } - } - } - - /* Error in anon_vma clone. */ - if (err) - goto anon_vma_fail; - - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; - - if (vma_expanded) { - vma_iter_config(vmi, vma_start, vma_end); - } else { - vma_iter_config(vmi, adjust->vm_start + adj_start, - adjust->vm_end); - } - - if (vma_iter_prealloc(vmi, vma)) - goto prealloc_fail; - - init_multi_vma_prep(&vp, vma, adjust, remove, remove2); - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); - - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - vma_set_range(vma, vma_start, vma_end, vma_pgoff); - - if (vma_expanded) - vma_iter_store(vmi, vma); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += adj_start >> PAGE_SHIFT; - if (adj_start < 0) { - WARN_ON(vma_expanded); - vma_iter_store(vmi, next); - } - } - - vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(res, vm_flags); - return res; - -prealloc_fail: - if (anon_dup) - unlink_anon_vmas(anon_dup); - -anon_vma_fail: - vma_iter_set(vmi, addr); - vma_iter_load(vmi); - return NULL; -} - -/* - * Rough compatibility check to quickly see if it's even worth looking - * at sharing an anon_vma. - * - * They need to have the same vm_file, and the flags can only differ - * in things that mprotect may change. - * - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that - * we can merge the two vma's. For example, we refuse to merge a vma if - * there is a vm_ops->close() function, because that indicates that the - * driver is doing some kind of reference counting. But that doesn't - * really matter for the anon_vma sharing case. - */ -static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) -{ - return a->vm_end == b->vm_start && - mpol_equal(vma_policy(a), vma_policy(b)) && - a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); -} - -/* - * Do some basic sanity checking to see if we can re-use the anon_vma - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be - * the same as 'old', the other will be the new one that is trying - * to share the anon_vma. - * - * NOTE! This runs with mmap_lock held for reading, so it is possible that - * the anon_vma of 'old' is concurrently in the process of being set up - * by another page fault trying to merge _that_. But that's ok: if it - * is being set up, that automatically means that it will be a singleton - * acceptable for merging, so we can do all of this optimistically. But - * we do that READ_ONCE() to make sure that we never re-load the pointer. - * - * IOW: that the "list_is_singular()" test on the anon_vma_chain only - * matters for the 'stable anon_vma' case (ie the thing we want to avoid - * is to return an anon_vma that is "complex" due to having gone through - * a fork). - * - * We also make sure that the two vma's are compatible (adjacent, - * and with the same memory policies). That's all stable, even with just - * a read lock on the mmap_lock. - */ -static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) -{ - if (anon_vma_compatible(a, b)) { - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); - - if (anon_vma && list_is_singular(&old->anon_vma_chain)) - return anon_vma; - } - return NULL; -} - -/* - * find_mergeable_anon_vma is used by anon_vma_prepare, to check - * neighbouring vmas for a suitable anon_vma, before it goes off - * to allocate a new anon_vma. It checks because a repetitive - * sequence of mprotects and faults may otherwise lead to distinct - * anon_vmas being allocated, preventing vma merge in subsequent - * mprotect. - */ -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) -{ - struct anon_vma *anon_vma = NULL; - struct vm_area_struct *prev, *next; - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); - - /* Try next first. */ - next = vma_iter_load(&vmi); - if (next) { - anon_vma = reusable_anon_vma(next, vma, next); - if (anon_vma) - return anon_vma; - } - - prev = vma_prev(&vmi); - VM_BUG_ON_VMA(prev != vma, vma); - prev = vma_prev(&vmi); - /* Try prev next. */ - if (prev) - anon_vma = reusable_anon_vma(prev, prev, vma); - - /* - * We might reach here with anon_vma == NULL if we can't find - * any reusable anon_vma. - * There's no absolute need to look only at touching neighbours: - * we could search further afield for "compatible" anon_vmas. - * But it would probably just be a waste of time searching, - * or lead to too many vmas hanging off the same anon_vma. - * We're trying to allow mprotect remerging later on, - * not trying to minimize memory used for anon_vmas. - */ - return anon_vma; -} - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1549,85 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) -{ - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); -} - -static bool vma_is_shared_writable(struct vm_area_struct *vma) -{ - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == - (VM_WRITE | VM_SHARED); -} - -static bool vma_fs_can_writeback(struct vm_area_struct *vma) -{ - /* No managed pages to writeback. */ - if (vma->vm_flags & VM_PFNMAP) - return false; - - return vma->vm_file && vma->vm_file->f_mapping && - mapping_can_writeback(vma->vm_file->f_mapping); -} - -/* - * Does this VMA require the underlying folios to have their dirty state - * tracked? - */ -bool vma_needs_dirty_tracking(struct vm_area_struct *vma) -{ - /* Only shared, writable VMAs require dirty tracking. */ - if (!vma_is_shared_writable(vma)) - return false; - - /* Does the filesystem need to be notified? */ - if (vm_ops_needs_writenotify(vma->vm_ops)) - return true; - - /* - * Even if the filesystem doesn't indicate a need for writenotify, if it - * can writeback, dirty tracking is still required. - */ - return vma_fs_can_writeback(vma); -} - -/* - * Some shared mappings will want the pages marked read-only - * to track write events. If so, we'll downgrade vm_page_prot - * to the private version (using protection_map[] without the - * VM_SHARED bit). - */ -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) -{ - /* If it was private or non-writable, the write bit is already clear */ - if (!vma_is_shared_writable(vma)) - return false; - - /* The backer wishes to know when pages are first written to? */ - if (vm_ops_needs_writenotify(vma->vm_ops)) - return true; - - /* The open routine did something to the protections that pgprot_modify - * won't preserve? */ - if (pgprot_val(vm_page_prot) != - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) - return false; - - /* - * Do we need to track softdirty? hugetlb does not support softdirty - * tracking yet. - */ - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) - return true; - - /* Do we need write faults for uffd-wp tracking? */ - if (userfaultfd_wp(vma)) - return true; - - /* Can the mapping track the dirty pages? */ - return vma_fs_can_writeback(vma); -} - /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. @@ -1754,6 +703,18 @@ retry: } /* + * Determine if the allocation needs to ensure that there is no + * existing mapping within it's guard gaps, for use as start_gap. + */ +static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) +{ + if (vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + +/* * Search for an unmapped address range. * * We are looking for a range that: @@ -1789,7 +750,7 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1814,6 +775,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; + info.start_gap = stack_guard_placement(vm_flags); return vm_unmapped_area(&info); } @@ -1821,9 +783,10 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { - return generic_get_unmapped_area(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area(filp, addr, len, pgoff, flags, + vm_flags); } #endif @@ -1834,7 +797,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; @@ -1862,6 +825,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); + info.start_gap = stack_guard_placement(vm_flags); addr = vm_unmapped_area(&info); /* @@ -1885,26 +849,10 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) + unsigned long flags, vm_flags_t vm_flags) { - return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); -} -#endif - -#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS -unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) -{ - return arch_get_unmapped_area(filp, addr, len, pgoff, flags); -} - -unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags) -{ - return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); + return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, + vm_flags); } #endif @@ -1914,9 +862,9 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi vm_flags_t vm_flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, - flags, vm_flags); - return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags); + return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, + flags, vm_flags); + return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long @@ -1978,8 +926,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long pgoff, unsigned long flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) - return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags); - return arch_get_unmapped_area(file, addr, len, pgoff, flags); + return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); + return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); @@ -2393,443 +1341,6 @@ success: return vma; } -/* - * Ok - we have the memory areas we should free on a maple tree so release them, - * and do the vma updates. - * - * Called with the mm semaphore held. - */ -static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) -{ - unsigned long nr_accounted = 0; - struct vm_area_struct *vma; - - /* Update high watermark before we lower total_vm */ - update_hiwater_vm(mm); - mas_for_each(mas, vma, ULONG_MAX) { - long nrpages = vma_pages(vma); - - if (vma->vm_flags & VM_ACCOUNT) - nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, -nrpages); - remove_vma(vma, false); - } - vm_unacct_memory(nr_accounted); -} - -/* - * Get rid of page table information in the indicated region. - * - * Called with the mm semaphore held. - */ -static void unmap_region(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked) -{ - struct mmu_gather tlb; - unsigned long mt_start = mas->index; - - lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - update_hiwater_rss(mm); - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); - mas_set(mas, mt_start); - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : USER_PGTABLES_CEILING, - mm_wr_locked); - tlb_finish_mmu(&tlb); -} - -/* - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it - * has already been checked or doesn't make sense to fail. - * VMA Iterator will point to the end VMA. - */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) -{ - struct vma_prepare vp; - struct vm_area_struct *new; - int err; - - WARN_ON(vma->vm_start >= addr); - WARN_ON(vma->vm_end <= addr); - - if (vma->vm_ops && vma->vm_ops->may_split) { - err = vma->vm_ops->may_split(vma, addr); - if (err) - return err; - } - - new = vm_area_dup(vma); - if (!new) - return -ENOMEM; - - if (new_below) { - new->vm_end = addr; - } else { - new->vm_start = addr; - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); - } - - err = -ENOMEM; - vma_iter_config(vmi, new->vm_start, new->vm_end); - if (vma_iter_prealloc(vmi, new)) - goto out_free_vma; - - err = vma_dup_policy(vma, new); - if (err) - goto out_free_vmi; - - err = anon_vma_clone(new, vma); - if (err) - goto out_free_mpol; - - if (new->vm_file) - get_file(new->vm_file); - - if (new->vm_ops && new->vm_ops->open) - new->vm_ops->open(new); - - vma_start_write(vma); - vma_start_write(new); - - init_vma_prep(&vp, vma); - vp.insert = new; - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); - - if (new_below) { - vma->vm_start = addr; - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; - } else { - vma->vm_end = addr; - } - - /* vma_complete stores the new vma */ - vma_complete(&vp, vmi, vma->vm_mm); - - /* Success. */ - if (new_below) - vma_next(vmi); - return 0; - -out_free_mpol: - mpol_put(vma_policy(new)); -out_free_vmi: - vma_iter_free(vmi); -out_free_vma: - vm_area_free(new); - return err; -} - -/* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. - */ -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) -{ - if (vma->vm_mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - - return __split_vma(vmi, vma, addr, new_below); -} - -/* - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd - * context and anonymous VMA name within the range [start, end). - * - * As a result, we might be able to merge the newly modified VMA range with an - * adjacent VMA with identical properties. - * - * If no merge is possible and the range does not span the entirety of the VMA, - * we then need to split the VMA to accommodate the change. - * - * The function returns either the merged VMA, the original VMA if a split was - * required instead, or an error if the split failed. - */ -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name) -{ - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - struct vm_area_struct *merged; - - merged = vma_merge(vmi, prev, vma, start, end, vm_flags, - pgoff, policy, uffd_ctx, anon_name); - if (merged) - return merged; - - if (vma->vm_start < start) { - int err = split_vma(vmi, vma, start, 1); - - if (err) - return ERR_PTR(err); - } - - if (vma->vm_end > end) { - int err = split_vma(vmi, vma, end, 0); - - if (err) - return ERR_PTR(err); - } - - return vma; -} - -/* - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller - * must ensure that [start, end) does not overlap any existing VMA. - */ -static struct vm_area_struct -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, - struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff) -{ - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff, - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* - * Expand vma by delta bytes, potentially merging with an immediately adjacent - * VMA with identical properties. - */ -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta) -{ - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); - - /* vma is specified as prev, so case 1 or 2 will apply. */ - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta, - vma->vm_flags, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* - * do_vmi_align_munmap() - munmap the aligned region from @start to @end. - * @vmi: The vma iterator - * @vma: The starting vm_area_struct - * @mm: The mm_struct - * @start: The aligned start address to munmap. - * @end: The aligned end address to munmap. - * @uf: The userfaultfd list_head - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on - * success. - * - * Return: 0 on success and drops the lock if so directed, error and leaves the - * lock held otherwise. - */ -static int -do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - struct mm_struct *mm, unsigned long start, - unsigned long end, struct list_head *uf, bool unlock) -{ - struct vm_area_struct *prev, *next = NULL; - struct maple_tree mt_detach; - int count = 0; - int error = -ENOMEM; - unsigned long locked_vm = 0; - MA_STATE(mas_detach, &mt_detach, 0, 0); - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_on_stack(mt_detach); - - /* - * If we need to split any vma, do it now to save pain later. - * - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially - * unmapped vm_area_struct will remain in use: so lower split_vma - * places tmp vma above, and higher split_vma places tmp vma below. - */ - - /* Does it split the first one? */ - if (start > vma->vm_start) { - - /* - * Make sure that map_count on return from munmap() will - * not exceed its limit; but let map_count go just above - * its limit temporarily, to help free resources as expected. - */ - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - goto map_count_exceeded; - - error = __split_vma(vmi, vma, start, 1); - if (error) - goto start_split_failed; - } - - /* - * Detach a range of VMAs from the mm. Using next as a temp variable as - * it is always overwritten. - */ - next = vma; - do { - /* Does it split the end? */ - if (next->vm_end > end) { - error = __split_vma(vmi, next, end, 0); - if (error) - goto end_split_failed; - } - vma_start_write(next); - mas_set(&mas_detach, count); - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); - if (error) - goto munmap_gather_failed; - vma_mark_detached(next, true); - if (next->vm_flags & VM_LOCKED) - locked_vm += vma_pages(next); - - count++; - if (unlikely(uf)) { - /* - * If userfaultfd_unmap_prep returns an error the vmas - * will remain split, but userland will get a - * highly unexpected error anyway. This is no - * different than the case where the first of the two - * __split_vma fails, but we don't undo the first - * split, despite we could. This is unlikely enough - * failure that it's not worth optimizing it for. - */ - error = userfaultfd_unmap_prep(next, start, end, uf); - - if (error) - goto userfaultfd_error; - } -#ifdef CONFIG_DEBUG_VM_MAPLE_TREE - BUG_ON(next->vm_start < start); - BUG_ON(next->vm_start > end); -#endif - } for_each_vma_range(*vmi, next, end); - -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - /* Make sure no VMAs are about to be lost. */ - { - MA_STATE(test, &mt_detach, 0, 0); - struct vm_area_struct *vma_mas, *vma_test; - int test_count = 0; - - vma_iter_set(vmi, start); - rcu_read_lock(); - vma_test = mas_find(&test, count - 1); - for_each_vma_range(*vmi, vma_mas, end) { - BUG_ON(vma_mas != vma_test); - test_count++; - vma_test = mas_next(&test, count - 1); - } - rcu_read_unlock(); - BUG_ON(count != test_count); - } -#endif - - while (vma_iter_addr(vmi) > start) - vma_iter_prev_range(vmi); - - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); - if (error) - goto clear_tree_failed; - - /* Point of no return */ - mm->locked_vm -= locked_vm; - mm->map_count -= count; - if (unlock) - mmap_write_downgrade(mm); - - prev = vma_iter_prev_range(vmi); - next = vma_next(vmi); - if (next) - vma_iter_prev_range(vmi); - - /* - * We can free page tables without write-locking mmap_lock because VMAs - * were isolated before we downgraded mmap_lock. - */ - mas_set(&mas_detach, 1); - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, - !unlock); - /* Statistics and freeing VMAs */ - mas_set(&mas_detach, 0); - remove_mt(mm, &mas_detach); - validate_mm(mm); - if (unlock) - mmap_read_unlock(mm); - - __mt_destroy(&mt_detach); - return 0; - -clear_tree_failed: -userfaultfd_error: -munmap_gather_failed: -end_split_failed: - mas_set(&mas_detach, 0); - mas_for_each(&mas_detach, next, end) - vma_mark_detached(next, false); - - __mt_destroy(&mt_detach); -start_split_failed: -map_count_exceeded: - validate_mm(mm); - return error; -} - -/* - * do_vmi_munmap() - munmap a given range. - * @vmi: The vma iterator - * @mm: The mm_struct - * @start: The start address to munmap - * @len: The length of the range to munmap - * @uf: The userfaultfd list_head - * @unlock: set to true if the user wants to drop the mmap_lock on success - * - * This function takes a @mas that is either pointing to the previous VMA or set - * to MA_START and sets it up to remove the mapping(s). The @len will be - * aligned and any arch_unmap work will be preformed. - * - * Return: 0 on success and drops the lock if so directed, error and leaves the - * lock held otherwise. - */ -int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, - unsigned long start, size_t len, struct list_head *uf, - bool unlock) -{ - unsigned long end; - struct vm_area_struct *vma; - - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - - end = start + PAGE_ALIGN(len); - if (end == start) - return -EINVAL; - - /* - * Check if memory is sealed before arch_unmap. - * Prevent unmapping a sealed VMA. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, start, end))) - return -EPERM; - - /* arch_unmap() might do unmaps itself. */ - arch_unmap(mm, start, end); - - /* Find the first overlapping VMA */ - vma = vma_find(vmi, end); - if (!vma) { - if (unlock) - mmap_write_unlock(mm); - return 0; - } - - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); -} - /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. * @mm: The mm_struct * @start: The start address to munmap @@ -2852,100 +1363,67 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; + pgoff_t pglen = PHYS_PFN(len); + struct vm_area_struct *merge; unsigned long charged = 0; + struct vma_munmap_struct vms; + struct ma_state mas_detach; + struct maple_tree mt_detach; unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; bool writable_file_mapping = false; - pgoff_t vm_pgoff; - int error; + int error = -ENOMEM; VMA_ITERATOR(vmi, mm, addr); + VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, end); + vmg.file = file; + /* Find the first overlapping VMA */ + vma = vma_find(&vmi, end); + init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); + if (vma) { + mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + mas_init(&mas_detach, &mt_detach, /* addr = */ 0); + /* Prepare to unmap any existing mapping in the area */ + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) + goto gather_failed; - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; + vmg.next = vms.next; + vmg.prev = vms.prev; + vma = NULL; + } else { + vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); } - /* Unmap any existing mapping in the area */ - error = do_vmi_munmap(&vmi, mm, addr, len, uf, false); - if (error == -EPERM) - return error; - else if (error) - return -ENOMEM; + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) + goto abort_munmap; /* * Private writable mapping: check memory availability */ if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - next = vma_next(&vmi); - prev = vma_prev(&vmi); - if (vm_flags & VM_SPECIAL) { - if (prev) - vma_iter_next_range(&vmi); - goto cannot_expand; - } - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { - merge_end = next->vm_end; - vma = next; - vm_pgoff = next->vm_pgoff - pglen; - } + charged = pglen; + charged -= vms.nr_accounted; + if (charged && security_vm_enough_memory_mm(mm, charged)) + goto abort_munmap; - /* Check prev */ - if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { - merge_start = prev->vm_start; - vma = prev; - vm_pgoff = prev->vm_pgoff; - } else if (prev) { - vma_iter_next_range(&vmi); + vms.nr_accounted = 0; + vm_flags |= VM_ACCOUNT; + vmg.flags = vm_flags; } - /* Actually expand, if possible */ - if (vma && - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); + vma = vma_merge_new_range(&vmg); + if (vma) goto expanded; - } - - if (vma == prev) - vma_iter_set(&vmi, addr); -cannot_expand: - /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; + if (!vma) goto unacct_error; - } vma_iter_config(&vmi, addr, end); vma_set_range(vma, addr, end, pgoff); @@ -2954,6 +1432,11 @@ cannot_expand: if (file) { vma->vm_file = get_file(file); + /* + * call_mmap() may map PTE, so ensure there are no existing PTEs + * and call the vm_ops close function if one exists. + */ + vms_clean_up_area(&vms, &mas_detach); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; @@ -2979,10 +1462,11 @@ cannot_expand: * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge_new_vma(&vmi, prev, vma, - vma->vm_start, vma->vm_end, - vma->vm_pgoff); + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + vmg.flags = vma->vm_flags; + /* If this fails, state is reset ready for a reattempt. */ + merge = vma_merge_new_range(&vmg); + if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -2998,6 +1482,7 @@ cannot_expand: vm_flags = vma->vm_flags; goto unmap_writable; } + vma_iter_config(&vmi, addr, end); } vm_flags = vma->vm_flags; @@ -3030,7 +1515,7 @@ cannot_expand: vma_link_file(vma); /* - * vma_merge() calls khugepaged_enter_vma() either, the below + * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ khugepaged_enter_vma(vma, vma->vm_flags); @@ -3044,14 +1529,17 @@ unmap_writable: expanded: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + /* Unmap any existing mapping in the area */ + vms_complete_munmap_vmas(&vms, &mas_detach); + + vm_stat_account(mm, vm_flags, pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else - mm->locked_vm += (len >> PAGE_SHIFT); + mm->locked_vm += pglen; } if (file) @@ -3072,7 +1560,7 @@ expanded: return addr; close_and_free_vma: - if (file && vma->vm_ops && vma->vm_ops->close) + if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (file || vma->vm_file) { @@ -3082,8 +1570,7 @@ unmap_and_free_vma: vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, - vma->vm_end, vma->vm_end, true); + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); } if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); @@ -3092,6 +1579,10 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); + +abort_munmap: + vms_abort_munmap_vmas(&vms, &mas_detach); +gather_failed: validate_mm(mm); return error; } @@ -3211,39 +1702,6 @@ out: } /* - * do_vma_munmap() - Unmap a full or partial vma. - * @vmi: The vma iterator pointing at the vma - * @vma: The first vma to be munmapped - * @start: the start of the address to unmap - * @end: The end of the address to unmap - * @uf: The userfaultfd list_head - * @unlock: Drop the lock on success - * - * unmaps a VMA mapping when the vma iterator is already in position. - * Does not handle alignment. - * - * Return: 0 on success drops the lock of so directed, error on failure and will - * still hold the lock. - */ -int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, struct list_head *uf, - bool unlock) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * Check if memory is sealed before arch_unmap. - * Prevent unmapping a sealed VMA. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, start, end))) - return -EPERM; - - arch_unmap(mm, start, end); - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); -} - -/* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address @@ -3259,7 +1717,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vma_prepare vp; /* * Check against address space limits by the changed size @@ -3279,25 +1736,16 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ - if (vma && vma->vm_end == addr && !vma_policy(vma) && - can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - vma_iter_config(vmi, vma->vm_start, addr + len); - if (vma_iter_prealloc(vmi, vma)) - goto unacct_fail; + if (vma && vma->vm_end == addr) { + VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); - vma_start_write(vma); - - init_vma_prep(&vp, vma); - vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - vma->vm_end = addr + len; - vm_flags_set(vma, VM_SOFTDIRTY); - vma_iter_store(vmi, vma); + vmg.prev = vma; + vma_iter_next_range(vmi); - vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(vma, flags); - goto out; + if (vma_merge_new_range(&vmg)) + goto out; + else if (vmg_nomem(&vmg)) + goto unacct_fail; } if (vma) @@ -3433,7 +1881,7 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, true); + remove_vma(vma, /* unreachable = */ true, /* closed = */ false); count++; cond_resched(); vma = vma_next(&vmi); @@ -3491,92 +1939,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } /* - * Copy the vma structure to a new location in the same mm, - * prior to moving page table entries, to effect an mremap move. - */ -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks) -{ - struct vm_area_struct *vma = *vmap; - unsigned long vma_start = vma->vm_start; - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; - bool faulted_in_anon_vma = true; - VMA_ITERATOR(vmi, mm, addr); - - /* - * If anonymous vma has not yet been faulted, update new pgoff - * to match new location, to increase its chance of merging. - */ - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { - pgoff = addr >> PAGE_SHIFT; - faulted_in_anon_vma = false; - } - - new_vma = find_vma_prev(mm, addr, &prev); - if (new_vma && new_vma->vm_start < addr + len) - return NULL; /* should never get here */ - - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); - if (new_vma) { - /* - * Source vma may have been merged into new_vma - */ - if (unlikely(vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end)) { - /* - * The only way we can get a vma_merge with - * self during an mremap is if the vma hasn't - * been faulted in yet and we were allowed to - * reset the dst vma->vm_pgoff to the - * destination address of the mremap to allow - * the merge to happen. mremap must change the - * vm_pgoff linearity between src and dst vmas - * (in turn preventing a vma_merge) to be - * safe. It is only safe to keep the vm_pgoff - * linear if there are no pages mapped yet. - */ - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); - *vmap = vma = new_vma; - } - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); - } else { - new_vma = vm_area_dup(vma); - if (!new_vma) - goto out; - vma_set_range(new_vma, addr, addr + len, pgoff); - if (vma_dup_policy(vma, new_vma)) - goto out_free_vma; - if (anon_vma_clone(new_vma, vma)) - goto out_free_mempol; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - if (vma_link(mm, new_vma)) - goto out_vma_link; - *need_rmap_locks = false; - } - return new_vma; - -out_vma_link: - if (new_vma->vm_ops && new_vma->vm_ops->close) - new_vma->vm_ops->close(new_vma); - - if (new_vma->vm_file) - fput(new_vma->vm_file); - - unlink_anon_vmas(new_vma); -out_free_mempol: - mpol_put(vma_policy(new_vma)); -out_free_vma: - vm_area_free(new_vma); -out: - return NULL; -} - -/* * Return true if the calling process may expand its vm space by the passed * number of pages */ @@ -3620,10 +1982,16 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* + * Close hook, called for unmap() and on the old vma for mremap(). + * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { + const struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->close) + sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) @@ -3665,27 +2033,17 @@ static const struct vm_operations_struct special_mapping_vmops = { .may_split = special_mapping_split, }; -static const struct vm_operations_struct legacy_special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; + struct vm_special_mapping *sm = vma->vm_private_data; - if (vma->vm_ops == &legacy_special_mapping_vmops) { - pages = vma->vm_private_data; - } else { - struct vm_special_mapping *sm = vma->vm_private_data; - - if (sm->fault) - return sm->fault(sm, vmf->vma, vmf); + if (sm->fault) + return sm->fault(sm, vmf->vma, vmf); - pages = sm->pages; - } + pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; @@ -3740,8 +2098,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && - (vma->vm_ops == &special_mapping_vmops || - vma->vm_ops == &legacy_special_mapping_vmops); + vma->vm_ops == &special_mapping_vmops; } /* @@ -3762,214 +2119,6 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } -int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) -{ - struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, (void *)pages, - &legacy_special_mapping_vmops); - - return PTR_ERR_OR_ZERO(vma); -} - -static DEFINE_MUTEX(mm_all_locks_mutex); - -static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) -{ - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { - /* - * The LSB of head.next can't change from under us - * because we hold the mm_all_locks_mutex. - */ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); - /* - * We can safely modify head.next after taking the - * anon_vma->root->rwsem. If some other vma in this mm shares - * the same anon_vma we won't take it again. - * - * No need of atomic instructions here, head.next - * can't change from under us thanks to the - * anon_vma->root->rwsem. - */ - if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_root.rb_node)) - BUG(); - } -} - -static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) -{ - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { - /* - * AS_MM_ALL_LOCKS can't change from under us because - * we hold the mm_all_locks_mutex. - * - * Operations on ->flags have to be atomic because - * even if AS_MM_ALL_LOCKS is stable thanks to the - * mm_all_locks_mutex, there may be other cpus - * changing other bitflags in parallel to us. - */ - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) - BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); - } -} - -/* - * This operation locks against the VM for all pte/vma/mm related - * operations that could ever happen on a certain mm. This includes - * vmtruncate, try_to_unmap, and all page faults. - * - * The caller must take the mmap_lock in write mode before calling - * mm_take_all_locks(). The caller isn't allowed to release the - * mmap_lock until mm_drop_all_locks() returns. - * - * mmap_lock in write mode is required in order to block all operations - * that could modify pagetables and free pages without need of - * altering the vma layout. It's also needed in write mode to avoid new - * anon_vmas to be associated with existing vmas. - * - * A single task can't take more than one mm_take_all_locks() in a row - * or it would deadlock. - * - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in - * mapping->flags avoid to take the same lock twice, if more than one - * vma in this mm is backed by the same anon_vma or address_space. - * - * We take locks in following order, accordingly to comment at beginning - * of mm/rmap.c: - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for - * hugetlb mapping); - * - all vmas marked locked - * - all i_mmap_rwsem locks; - * - all anon_vma->rwseml - * - * We can take all locks within these types randomly because the VM code - * doesn't nest them and we protected from parallel mm_take_all_locks() by - * mm_all_locks_mutex. - * - * mm_take_all_locks() and mm_drop_all_locks are expensive operations - * that may have to take thousand of locks. - * - * mm_take_all_locks() can fail if it's interrupted by signals. - */ -int mm_take_all_locks(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - struct anon_vma_chain *avc; - VMA_ITERATOR(vmi, mm, 0); - - mmap_assert_write_locked(mm); - - mutex_lock(&mm_all_locks_mutex); - - /* - * vma_start_write() does not have a complement in mm_drop_all_locks() - * because vma_start_write() is always asymmetrical; it marks a VMA as - * being written to until mmap_write_unlock() or mmap_write_downgrade() - * is reached. - */ - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - vma_start_write(vma); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping && - is_vm_hugetlb_page(vma)) - vm_lock_mapping(mm, vma->vm_file->f_mapping); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping && - !is_vm_hugetlb_page(vma)) - vm_lock_mapping(mm, vma->vm_file->f_mapping); - } - - vma_iter_init(&vmi, mm, 0); - for_each_vma(vmi, vma) { - if (signal_pending(current)) - goto out_unlock; - if (vma->anon_vma) - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - vm_lock_anon_vma(mm, avc->anon_vma); - } - - return 0; - -out_unlock: - mm_drop_all_locks(mm); - return -EINTR; -} - -static void vm_unlock_anon_vma(struct anon_vma *anon_vma) -{ - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { - /* - * The LSB of head.next can't change to 0 from under - * us because we hold the mm_all_locks_mutex. - * - * We must however clear the bitflag before unlocking - * the vma so the users using the anon_vma->rb_root will - * never see our bitflag. - * - * No need of atomic instructions here, head.next - * can't change from under us until we release the - * anon_vma->root->rwsem. - */ - if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_root.rb_node)) - BUG(); - anon_vma_unlock_write(anon_vma); - } -} - -static void vm_unlock_mapping(struct address_space *mapping) -{ - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { - /* - * AS_MM_ALL_LOCKS can't change to 0 from under us - * because we hold the mm_all_locks_mutex. - */ - i_mmap_unlock_write(mapping); - if (!test_and_clear_bit(AS_MM_ALL_LOCKS, - &mapping->flags)) - BUG(); - } -} - -/* - * The mmap_lock cannot be released by the caller until - * mm_drop_all_locks() returns. - */ -void mm_drop_all_locks(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - struct anon_vma_chain *avc; - VMA_ITERATOR(vmi, mm, 0); - - mmap_assert_write_locked(mm); - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - - for_each_vma(vmi, vma) { - if (vma->anon_vma) - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - vm_unlock_anon_vma(avc->anon_vma); - if (vma->vm_file && vma->vm_file->f_mapping) - vm_unlock_mapping(vma->vm_file->f_mapping); - } - - mutex_unlock(&mm_all_locks_mutex); -} - /* * initialise the percpu counter for VM */ @@ -4088,3 +2237,86 @@ static int __meminit init_reserve_notifier(void) return 0; } subsys_initcall(init_reserve_notifier); + +/* + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between + * this VMA and its relocated range, which will now reside at [vma->vm_start - + * shift, vma->vm_end - shift). + * + * This function is almost certainly NOT what you want for anything other than + * early executable temporary stack relocation. + */ +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) +{ + /* + * The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. + * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ + + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff); + struct vm_area_struct *next; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != vma_next(&vmi)) + return -EFAULT; + + vma_iter_prev_range(&vmi); + /* + * cover the whole range: [new_start, old_end) + */ + vmg.vma = vma; + if (vma_expand(&vmg)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length, false, true)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + next ? next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb); + + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8982e6139d07..fc18fe274505 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -19,6 +19,8 @@ #include <linux/sched/mm.h> #include <linux/slab.h> +#include "vma.h" + /* global SRCU for all MMs */ DEFINE_STATIC_SRCU(srcu); diff --git a/mm/mmzone.c b/mm/mmzone.c index c01896eca736..f9baa8882fbf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -66,7 +66,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, z++; else while (zonelist_zone_idx(z) > highest_zoneidx || - (z->zone && !zref_in_nodemask(z, nodes))) + (zonelist_zone(z) && !zref_in_nodemask(z, nodes))) z++; return z; diff --git a/mm/mprotect.c b/mm/mprotect.c index 222ab434da54..0c5d6d06107d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -161,8 +161,7 @@ static long change_pte_range(struct mmu_gather *tlb, if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) continue; - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !toptier) + if (folio_use_access_time(folio)) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } @@ -303,8 +302,9 @@ pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags) { /* * pte markers only resides in pte level, if we need pte markers, - * we need to split. We cannot wr-protect shmem thp because file - * thp is handled differently when split by erasing the pmd so far. + * we need to split. For example, we cannot wr-protect a file thp + * (e.g. 2M shmem) because file thp is handled differently when + * split by erasing the pmd so far. */ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); } @@ -364,9 +364,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, unsigned long next; long pages = 0; unsigned long nr_huge_updates = 0; - struct mmu_notifier_range range; - - range.start = 0; pmd = pmd_offset(pud, addr); do { @@ -384,14 +381,6 @@ again: if (pmd_none(*pmd)) goto next; - /* invoke the mmu notifier if the pmd is populated */ - if (!range.start) { - mmu_notifier_range_init(&range, - MMU_NOTIFY_PROTECTION_VMA, 0, - vma->vm_mm, addr, end); - mmu_notifier_invalidate_range_start(&range); - } - _pmd = pmdp_get_lockless(pmd); if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || @@ -432,9 +421,6 @@ next: cond_resched(); } while (pmd++, addr = next, addr != end); - if (range.start) - mmu_notifier_invalidate_range_end(&range); - if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -444,21 +430,57 @@ static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { - pud_t *pud; + struct mmu_notifier_range range; + pud_t *pudp, pud; unsigned long next; long pages = 0, ret; - pud = pud_offset(p4d, addr); + range.start = 0; + + pudp = pud_offset(p4d, addr); do { +again: next = pud_addr_end(addr, end); - ret = change_prepare(vma, pud, pmd, addr, cp_flags); - if (ret) - return ret; - if (pud_none_or_clear_bad(pud)) + ret = change_prepare(vma, pudp, pmd, addr, cp_flags); + if (ret) { + pages = ret; + break; + } + + pud = READ_ONCE(*pudp); + if (pud_none(pud)) continue; - pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, + + if (!range.start) { + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } + + if (pud_leaf(pud)) { + if ((next - addr != PUD_SIZE) || + pgtable_split_needed(vma, cp_flags)) { + __split_huge_pud(vma, pudp, addr); + goto again; + } else { + ret = change_huge_pud(tlb, vma, pudp, + addr, newprot, cp_flags); + if (ret == 0) + goto again; + /* huge pud was handled */ + if (ret == HPAGE_PUD_NR) + pages += HPAGE_PUD_NR; + continue; + } + } + + pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot, cp_flags); - } while (pud++, addr = next, addr != end); + } while (pudp++, addr = next, addr != end); + + if (range.start) + mmu_notifier_invalidate_range_end(&range); return pages; } @@ -589,6 +611,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long charged = 0; int error; + if (!can_modify_vma(vma)) + return -EPERM; + if (newflags == oldflags) { *pprev = vma; return 0; @@ -747,15 +772,6 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } - /* - * checking if memory is sealed. - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(current->mm, start, end))) { - error = -EPERM; - goto out; - } - prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; diff --git a/mm/mremap.c b/mm/mremap.c index e7ae140fc640..24712f8dbb6b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -902,19 +902,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if ((mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; - /* - * In mremap_to(). - * Move a VMA to another location, check if src addr is sealed. - * - * Place can_modify_mm here because mremap_to() - * does its own checking for address range, and we only - * check the sealing after passing those checks. - * - * can_modify_mm assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) - return -EPERM; - if (flags & MREMAP_FIXED) { /* * In mremap_to(). @@ -1052,6 +1039,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } + /* Don't allow remapping vmas when they have already been sealed */ + if (!can_modify_vma(vma)) { + ret = -EPERM; + goto out; + } + if (is_vm_hugetlb_page(vma)) { struct hstate *h __maybe_unused = hstate_vma(vma); @@ -1080,19 +1073,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Below is shrink/expand case (not mremap_to()) - * Check if src address is sealed, if so, reject. - * In other words, prevent shrinking or expanding a sealed VMA. - * - * Place can_modify_mm here so we can keep the logic related to - * shrink/expand together. - */ - if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) { - ret = -EPERM; - goto out; - } - - /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. * do_vmi_munmap does all the needed commit accounting, and diff --git a/mm/mseal.c b/mm/mseal.c index c8787cc6ba55..ece977bd21e1 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -16,28 +16,11 @@ #include <linux/sched.h> #include "internal.h" -static inline bool vma_is_sealed(struct vm_area_struct *vma) -{ - return (vma->vm_flags & VM_SEALED); -} - static inline void set_vma_sealed(struct vm_area_struct *vma) { vm_flags_set(vma, VM_SEALED); } -/* - * check if a vma is sealed for modification. - * return true, if modification is allowed. - */ -static bool can_modify_vma(struct vm_area_struct *vma) -{ - if (unlikely(vma_is_sealed(vma))) - return false; - - return true; -} - static bool is_madv_discard(int behavior) { switch (behavior) { @@ -71,45 +54,15 @@ static bool is_ro_anon(struct vm_area_struct *vma) } /* - * Check if the vmas of a memory range are allowed to be modified. - * the memory ranger can have a gap (unallocated memory). - * return true, if it is allowed. - */ -bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) -{ - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - - /* going through each vma to check. */ - for_each_vma_range(vmi, vma, end) { - if (unlikely(!can_modify_vma(vma))) - return false; - } - - /* Allow by default. */ - return true; -} - -/* - * Check if the vmas of a memory range are allowed to be modified by madvise. - * the memory ranger can have a gap (unallocated memory). - * return true, if it is allowed. + * Check if a vma is allowed to be modified by madvise. */ -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, - int behavior) +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - if (!is_madv_discard(behavior)) return true; - /* going through each vma to check. */ - for_each_vma_range(vmi, vma, end) - if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) - return false; + if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) + return false; /* Allow by default. */ return true; diff --git a/mm/nommu.c b/mm/nommu.c index 7296e775e04e..385b0c15add8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,6 +126,11 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); +void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); +} + void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, @@ -1573,12 +1578,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - return NULL; -} - int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { diff --git a/mm/numa.c b/mm/numa.c new file mode 100644 index 000000000000..e2eec07707d1 --- /dev/null +++ b/mm/numa.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/memblock.h> +#include <linux/printk.h> +#include <linux/numa.h> +#include <linux/numa_memblks.h> + +struct pglist_data *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +/* Allocate NODE_DATA for a node on the local memory */ +void __init alloc_node_data(int nid) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); + u64 nd_pa; + void *nd; + int tnid; + + /* Allocate node data. Try node-local memory and then any node. */ + nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) + panic("Cannot allocate %zu bytes for node %d data\n", + nd_size, nid); + nd = __va(nd_pa); + + /* report and initialize */ + pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); +} + +void __init alloc_offline_node_data(int nid) +{ + pg_data_t *pgdat; + + pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); + if (!pgdat) + panic("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + + node_data[nid] = pgdat; +} + +/* Stub functions: */ + +#ifndef memory_add_physaddr_to_nid +int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + +#ifndef phys_to_target_node +int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(phys_to_target_node); +#endif diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c new file mode 100644 index 000000000000..031fb9961bf7 --- /dev/null +++ b/mm/numa_emulation.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/numa_memblks.h> +#include <asm/numa.h> + +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) + +static int emu_nid_to_phys[MAX_NUMNODES]; +static char *emu_cmdline __initdata; + +int __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; + return 0; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - mem_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = numa_emu_dma_end(); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; + u64 min_size; + + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) + return -1; + + /* + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. + */ + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = numa_emu_dma_end(); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, 0); +} + +static int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. + */ + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + if (*emu_cmdline == ':') + emu_cmdline++; + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + phys_dist = memblock_alloc(phys_size, PAGE_SIZE); + if (!phys_dist) { + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); + + /* commit */ + *numa_meminfo = ei; + + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + + numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys)); + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + memblock_free(phys_dist, phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void numa_add_cpu(unsigned int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(unsigned int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(unsigned int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void numa_add_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(unsigned int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c new file mode 100644 index 000000000000..be52b93a9c58 --- /dev/null +++ b/mm/numa_memblks.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/array_size.h> +#include <linux/sort.h> +#include <linux/printk.h> +#include <linux/memblock.h> +#include <linux/numa.h> +#include <linux/numa_memblks.h> + +static int numa_distance_cnt; +static u8 *numa_distance; + +nodemask_t numa_nodes_parsed __initdata; + +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(numa_distance, size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (!numa_distance) { + pr_warn("Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unnecessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = memblock_start_of_DRAM(); + const u64 high = memblock_end_of_DRAM(); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ + bi->start = max(bi->start, low); + + /* preserve info for non-RAM areas above 'max_pfn': */ + if (bi->end > high) { + numa_add_memblk_to(bi->nid, high, bi->end, + &numa_reserved_meminfo); + bi->end = high; + } + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); + return -EINVAL; + } + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ +static void __init numa_clear_kernel_node_hotplug(void) +{ + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; + + /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * + * At this time, all memory regions reserved by memblock are + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all parsed memory blocks and use those ranges to + * set the nid in memblock.reserved. This will split up the + * memblock regions along node boundaries and will set the node IDs + * as well. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; + + ret = memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); + } + + /* + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. + * + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] + */ + for_each_reserved_mem_region(mb_region) { + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); + } + + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) + continue; + + memblock_clear_hotplug(mb->start, mb->end - mb->start); + } +} + +static int __init numa_register_meminfo(struct numa_meminfo *mi) +{ + int i; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. + */ + numa_clear_kernel_node_hotplug(); + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ + if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { + unsigned long pfn_align = node_map_pfn_alignment(); + + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20; + + unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20; + + pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n", + node_align_mb, sect_align_mb); + return -EINVAL; + } + } + + return 0; +} + +int __init numa_memblks_init(int (*init_func)(void), + bool memblock_force_top_down) +{ + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + int ret; + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE)); + WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved, + NUMA_NO_NODE)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, max_addr)); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we did't configure ACPI_NUMA + * or acpi numa init fails and fallbacks to dummy + * numa init. + */ + if (memblock_force_top_down) + memblock_set_bottom_up(false); + + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + return numa_register_meminfo(&numa_meminfo); +} + +static int __init cmp_memblk(const void *a, const void *b) +{ + const struct numa_memblk *ma = *(const struct numa_memblk **)a; + const struct numa_memblk *mb = *(const struct numa_memblk **)b; + + return (ma->start > mb->start) - (ma->start < mb->start); +} + +static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; + +/** + * numa_fill_memblks - Fill gaps in numa_meminfo memblks + * @start: address to begin fill + * @end: address to end fill + * + * Find and extend numa_meminfo memblks to cover the physical + * address range @start-@end + * + * RETURNS: + * 0 : Success + * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end + */ + +int __init numa_fill_memblks(u64 start, u64 end) +{ + struct numa_memblk **blk = &numa_memblk_list[0]; + struct numa_meminfo *mi = &numa_meminfo; + int count = 0; + u64 prev_end; + + /* + * Create a list of pointers to numa_meminfo memblks that + * overlap start, end. The list is used to make in-place + * changes that fill out the numa_meminfo memblks. + */ + for (int i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + if (memblock_addrs_overlap(start, end - start, bi->start, + bi->end - bi->start)) { + blk[count] = &mi->blk[i]; + count++; + } + } + if (!count) + return NUMA_NO_MEMBLK; + + /* Sort the list of pointers in memblk->start order */ + sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); + + /* Make sure the first/last memblks include start/end */ + blk[0]->start = min(blk[0]->start, start); + blk[count - 1]->end = max(blk[count - 1]->end, end); + + /* + * Fill any gaps by tracking the previous memblks + * end address and backfilling to it if needed. + */ + prev_end = blk[0]->end; + for (int i = 1; i < count; i++) { + struct numa_memblk *curr = blk[i]; + + if (prev_end >= curr->start) { + if (prev_end < curr->end) + prev_end = curr->end; + } else { + curr->start = prev_end; + prev_end = curr->end; + } + } + return 0; +} + +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. + */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +#endif /* CONFIG_NUMA_KEEP_MEMINFO */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7a04cb1918fd..fcd4c1439cb9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2612,7 +2612,7 @@ struct folio *writeback_iter(struct address_space *mapping, done: if (wbc->range_cyclic) - mapping->writeback_index = folio->index + folio_nr_pages(folio); + mapping->writeback_index = folio_next_index(folio); folio_batch_release(&wbc->fbatch); return NULL; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0aefae4a26b2..8afab64814dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -286,9 +286,7 @@ EXPORT_SYMBOL(nr_online_nodes); #endif static bool page_contains_unaccepted(struct page *page, unsigned int order); -static void accept_page(struct page *page, unsigned int order); static bool cond_accept_memory(struct zone *zone, unsigned int order); -static inline bool has_unaccepted_memory(void); static bool __free_unaccepted(struct page *page); int page_group_by_mobility_disabled __read_mostly; @@ -322,6 +320,11 @@ static inline bool deferred_pages_enabled(void) { return false; } + +static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return false; +} #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /* Return a pointer to the bitmap storing bits affecting a block of pages */ @@ -958,8 +961,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ - if (unlikely(!list_empty(&folio->_deferred_list))) { - bad_page(page, "on deferred list"); + if (unlikely(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio))) { + bad_page(page, "partially mapped folio on deferred list"); goto out; } break; @@ -1087,8 +1091,11 @@ __always_inline bool free_pages_prepare(struct page *page, (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } - if (PageMappingFlags(page)) + if (PageMappingFlags(page)) { + if (PageAnon(page)) + mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); page->mapping = NULL; + } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) bad++; @@ -1199,17 +1206,39 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock_irqrestore(&zone->lock, flags); } +/* Split a multi-block free page into its individual pageblocks. */ +static void split_large_buddy(struct zone *zone, struct page *page, + unsigned long pfn, int order, fpi_t fpi) +{ + unsigned long end = pfn + (1 << order); + + VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); + /* Caller removed page from freelist, buddy info cleared! */ + VM_WARN_ON_ONCE(PageBuddy(page)); + + if (order > pageblock_order) + order = pageblock_order; + + while (pfn != end) { + int mt = get_pfnblock_migratetype(page, pfn); + + __free_one_page(page, pfn, zone, order, mt, fpi); + pfn += 1 << order; + page = pfn_to_page(pfn); + } +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { unsigned long flags; - int migratetype; spin_lock_irqsave(&zone->lock, flags); - migratetype = get_pfnblock_migratetype(page, pfn); - __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); } static void __free_pages_ok(struct page *page, unsigned int order, @@ -1218,12 +1247,8 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order)) - return; - - free_one_page(zone, page, pfn, order, fpi_flags); - - __count_vm_events(PGFREE, 1 << order); + if (free_pages_prepare(page, order)) + free_one_page(zone, page, pfn, order, fpi_flags); } void __meminit __free_pages_core(struct page *page, unsigned int order, @@ -1270,7 +1295,7 @@ void __meminit __free_pages_core(struct page *page, unsigned int order, if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) return; - accept_page(page, order); + accept_memory(page_to_phys(page), PAGE_SIZE << order); } /* @@ -1346,11 +1371,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, * * -- nyc */ -static inline void expand(struct zone *zone, struct page *page, - int low, int high, int migratetype) +static inline unsigned int expand(struct zone *zone, struct page *page, int low, + int high, int migratetype) { - unsigned long size = 1 << high; - unsigned long nr_added = 0; + unsigned int size = 1 << high; + unsigned int nr_added = 0; while (high > low) { high--; @@ -1370,7 +1395,19 @@ static inline void expand(struct zone *zone, struct page *page, set_buddy_order(&page[size], high); nr_added += size; } - account_freepages(zone, nr_added, migratetype); + + return nr_added; +} + +static __always_inline void page_del_and_expand(struct zone *zone, + struct page *page, int low, + int high, int migratetype) +{ + int nr_pages = 1 << high; + + __del_page_from_free_list(page, zone, high, migratetype); + nr_pages -= expand(zone, page, low, high, migratetype); + account_freepages(zone, -nr_pages, migratetype); } static void check_new_page_bad(struct page *page) @@ -1540,8 +1577,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_list(page, zone, current_order, migratetype); - expand(zone, page, order, current_order, migratetype); + + page_del_and_expand(zone, page, order, current_order, + migratetype); trace_mm_page_alloc_zone_locked(page, order, migratetype, pcp_allowed_order(order) && migratetype < MIGRATE_PCPTYPES); @@ -1700,27 +1738,6 @@ static unsigned long find_large_buddy(unsigned long start_pfn) return start_pfn; } -/* Split a multi-block free page into its individual pageblocks */ -static void split_large_buddy(struct zone *zone, struct page *page, - unsigned long pfn, int order) -{ - unsigned long end_pfn = pfn + (1 << order); - - VM_WARN_ON_ONCE(order <= pageblock_order); - VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1)); - - /* Caller removed page from freelist, buddy info cleared! */ - VM_WARN_ON_ONCE(PageBuddy(page)); - - while (pfn != end_pfn) { - int mt = get_pfnblock_migratetype(page, pfn); - - __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE); - pfn += pageblock_nr_pages; - page = pfn_to_page(pfn); - } -} - /** * move_freepages_block_isolate - move free pages in block for page isolation * @zone: the zone @@ -1761,7 +1778,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(buddy, zone, order, get_pfnblock_migratetype(buddy, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, buddy, pfn, order); + split_large_buddy(zone, buddy, pfn, order, FPI_NONE); return true; } @@ -1772,7 +1789,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, del_page_from_free_list(page, zone, order, get_pfnblock_migratetype(page, pfn)); set_pageblock_migratetype(page, migratetype); - split_large_buddy(zone, page, pfn, order); + split_large_buddy(zone, page, pfn, order, FPI_NONE); return true; } move: @@ -1892,9 +1909,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page, /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { + unsigned int nr_added; + del_page_from_free_list(page, zone, current_order, block_type); change_pageblock_range(page, current_order, start_type); - expand(zone, page, order, current_order, start_type); + nr_added = expand(zone, page, order, current_order, start_type); + account_freepages(zone, nr_added, start_type); return page; } @@ -1947,8 +1967,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page, } single_page: - del_page_from_free_list(page, zone, current_order, block_type); - expand(zone, page, order, current_order, block_type); + page_del_and_expand(zone, page, order, current_order, block_type); return page; } @@ -2764,7 +2783,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -3033,12 +3052,6 @@ struct page *rmqueue(struct zone *preferred_zone, { struct page *page; - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - if (likely(pcp_allowed_order(order))) { page = rmqueue_pcplist(preferred_zone, zone, order, migratetype, alloc_flags); @@ -3357,7 +3370,7 @@ retry: } if (no_fallback && nr_online_nodes > 1 && - zone != ac->preferred_zoneref->zone) { + zone != zonelist_zone(ac->preferred_zoneref)) { int local_nid; /* @@ -3365,7 +3378,7 @@ retry: * fragmenting fallbacks. Locality is more important * than fragmentation avoidance. */ - local_nid = zone_to_nid(ac->preferred_zoneref->zone); + local_nid = zonelist_node_idx(ac->preferred_zoneref); if (zone_to_nid(zone) != local_nid) { alloc_flags &= ~ALLOC_NOFRAGMENT; goto retry; @@ -3402,7 +3415,6 @@ check_alloc_wmark: if (cond_accept_memory(zone, order)) goto try_this_zone; -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. @@ -3411,14 +3423,13 @@ check_alloc_wmark: if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; if (!node_reclaim_enabled() || - !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) + !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone)) continue; ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); @@ -3440,7 +3451,7 @@ check_alloc_wmark: } try_this_zone: - page = rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -3457,13 +3468,11 @@ try_this_zone: if (cond_accept_memory(zone, order)) goto try_this_zone; -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } -#endif } } @@ -4100,6 +4109,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long min_wmark = min_wmark_pages(zone); bool wmark; + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + available = reclaimable = zone_reclaimable_pages(zone); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -4175,6 +4189,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; bool can_compact = gfp_compaction_allowed(gfp_mask); + bool nofail = gfp_mask & __GFP_NOFAIL; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4187,6 +4202,25 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; + if (unlikely(nofail)) { + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 1); + /* + * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM, + * otherwise, we may result in lockup. + */ + WARN_ON_ONCE(!can_direct_reclaim); + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + } + restart: compaction_retries = 0; no_progress_loops = 0; @@ -4209,7 +4243,7 @@ restart: */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, ac->nodemask); - if (!ac->preferred_zoneref->zone) + if (!zonelist_zone(ac->preferred_zoneref)) goto nopage; /* @@ -4221,7 +4255,7 @@ restart: struct zoneref *z = first_zones_zonelist(ac->zonelist, ac->highest_zoneidx, &cpuset_current_mems_allowed); - if (!z->zone) + if (!zonelist_zone(z)) goto nopage; } @@ -4404,30 +4438,16 @@ nopage: * Make sure that __GFP_NOFAIL request doesn't leak out and make sure * we always retry */ - if (gfp_mask & __GFP_NOFAIL) { + if (unlikely(nofail)) { /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually require GFP_NOWAIT + * Lacking direct_reclaim we can't do anything to reclaim memory, + * we disregard these unreasonable nofail requests and still + * return NULL */ - if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) + if (!can_direct_reclaim) goto fail; /* - * PF_MEMALLOC request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us - */ - WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); - - /* - * non failing costly orders are a hard requirement which we - * are not prepared for much so let's warn about these users - * so that we can identify them and convert them to something - * else. - */ - WARN_ON_ONCE_GFP(costly_order, gfp_mask); - - /* * Help non-failing allocations by giving some access to memory * reserves normally used for high priority non-blocking * allocations but do not use ALLOC_NO_WATERMARKS because this @@ -4578,17 +4598,28 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, continue; } - if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && - zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) && + zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) { goto failed; } + cond_accept_memory(zone, 0); +retry_this_zone: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; if (zone_watermark_fast(zone, 0, mark, zonelist_zone_idx(ac.preferred_zoneref), alloc_flags, gfp)) { break; } + + if (cond_accept_memory(zone, 0)) + goto retry_this_zone; + + /* Try again if zone has deferred pages */ + if (deferred_pages_enabled()) { + if (_deferred_grow_zone(zone, 0)) + goto retry_this_zone; + } } /* @@ -4638,7 +4669,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); - zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account); out: return nr_populated; @@ -4696,7 +4727,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); + alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp); /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); @@ -4950,7 +4981,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); @@ -5301,7 +5332,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return zone_to_nid(z->zone); + return zonelist_node_idx(z); } #endif @@ -6433,6 +6464,31 @@ int __alloc_contig_migrate_range(struct compact_control *cc, return (ret < 0) ? ret : 0; } +static void split_free_pages(struct list_head *list) +{ + int order; + + for (order = 0; order < NR_PAGE_ORDERS; order++) { + struct page *page, *next; + int nr_pages = 1 << order; + + list_for_each_entry_safe(page, next, &list[order], lru) { + int i; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (!order) + continue; + + split_page(page, order); + + /* Add all subpages to the order-0 head, in sequence. */ + list_del(&page->lru); + for (i = 0; i < nr_pages; i++) + list_add_tail(&page[i].lru, &list[0]); + } + } +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -6545,12 +6601,25 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, goto done; } - /* Free head and tail (if any) */ - if (start != outer_start) - free_contig_range(outer_start, start - outer_start); - if (end != outer_end) - free_contig_range(end, outer_end - end); + if (!(gfp_mask & __GFP_COMP)) { + split_free_pages(cc.freepages); + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { + struct page *head = pfn_to_page(start); + int order = ilog2(end - start); + + check_new_pages(head, order); + prep_new_page(head, order, gfp_mask, 0); + } else { + ret = -EINVAL; + WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", + start, end, outer_start, outer_end); + } done: undo_isolate_page_range(start, end, migratetype); return ret; @@ -6659,6 +6728,18 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, void free_contig_range(unsigned long pfn, unsigned long nr_pages) { unsigned long count = 0; + struct folio *folio = pfn_folio(pfn); + + if (folio_test_large(folio)) { + int expected = folio_nr_pages(folio); + + if (nr_pages == expected) + folio_put(folio); + else + WARN(true, "PFN %lu: nr_pages %lu != expected %d\n", + pfn, nr_pages, expected); + return; + } for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -6927,23 +7008,50 @@ early_param("accept_memory", accept_memory_parse); static bool page_contains_unaccepted(struct page *page, unsigned int order) { phys_addr_t start = page_to_phys(page); - phys_addr_t end = start + (PAGE_SIZE << order); - return range_contains_unaccepted_memory(start, end); + return range_contains_unaccepted_memory(start, PAGE_SIZE << order); } -static void accept_page(struct page *page, unsigned int order) +static void __accept_page(struct zone *zone, unsigned long *flags, + struct page *page) { - phys_addr_t start = page_to_phys(page); + bool last; + + list_del(&page->lru); + last = list_empty(&zone->unaccepted_pages); + + account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); + __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); + __ClearPageUnaccepted(page); + spin_unlock_irqrestore(&zone->lock, *flags); + + accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); + + __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - accept_memory(start, start + (PAGE_SIZE << order)); + if (last) + static_branch_dec(&zones_with_unaccepted_pages); +} + +void accept_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); + if (!PageUnaccepted(page)) { + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page); } static bool try_to_accept_memory_one(struct zone *zone) { unsigned long flags; struct page *page; - bool last; spin_lock_irqsave(&zone->lock, flags); page = list_first_entry_or_null(&zone->unaccepted_pages, @@ -6953,23 +7061,17 @@ static bool try_to_accept_memory_one(struct zone *zone) return false; } - list_del(&page->lru); - last = list_empty(&zone->unaccepted_pages); - - account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); - __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); - spin_unlock_irqrestore(&zone->lock, flags); - - accept_page(page, MAX_PAGE_ORDER); - - __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - - if (last) - static_branch_dec(&zones_with_unaccepted_pages); + /* Unlocks zone->lock */ + __accept_page(zone, &flags, page); return true; } +static inline bool has_unaccepted_memory(void) +{ + return static_branch_unlikely(&zones_with_unaccepted_pages); +} + static bool cond_accept_memory(struct zone *zone, unsigned int order) { long to_accept; @@ -6981,8 +7083,8 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) if (list_empty(&zone->unaccepted_pages)) return false; - /* How much to accept to get to high watermark? */ - to_accept = high_wmark_pages(zone) - + /* How much to accept to get to promo watermark? */ + to_accept = promo_wmark_pages(zone) - (zone_page_state(zone, NR_FREE_PAGES) - __zone_watermark_unusable_free(zone, order, 0) - zone_page_state(zone, NR_UNACCEPTED)); @@ -6997,11 +7099,6 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order) return ret; } -static inline bool has_unaccepted_memory(void) -{ - return static_branch_unlikely(&zones_with_unaccepted_pages); -} - static bool __free_unaccepted(struct page *page) { struct zone *zone = page_zone(page); @@ -7016,6 +7113,7 @@ static bool __free_unaccepted(struct page *page) list_add_tail(&page->lru, &zone->unaccepted_pages); account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); + __SetPageUnaccepted(page); spin_unlock_irqrestore(&zone->lock, flags); if (first) @@ -7031,20 +7129,11 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order) return false; } -static void accept_page(struct page *page, unsigned int order) -{ -} - static bool cond_accept_memory(struct zone *zone, unsigned int order) { return false; } -static inline bool has_unaccepted_memory(void) -{ - return false; -} - static bool __free_unaccepted(struct page *page) { BUILD_BUG(); diff --git a/mm/page_counter.c b/mm/page_counter.c index 0153f5bb3161..b249d15af9dd 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,11 @@ #include <linux/bug.h> #include <asm/page.h> +static bool track_protection(struct page_counter *c) +{ + return c->protection_support; +} + static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { @@ -57,7 +62,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) new = 0; atomic_long_set(&counter->usage, new); } - propagate_protected_usage(counter, new); + if (track_protection(counter)) + propagate_protected_usage(counter, new); } /** @@ -70,18 +76,33 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; + bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_protected_usage(c, new); + if (protection) + propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. + * + * Notably, we have two watermarks to allow for both a globally + * visible peak and one that can be reset at a smaller scope. + * + * Since we reset both watermarks when the global reset occurs, + * we can guarantee that watermark >= local_watermark, so we + * don't need to do both comparisons every time. + * + * On systems with branch predictors, the inner condition should + * be almost free. */ - if (new > READ_ONCE(c->watermark)) - WRITE_ONCE(c->watermark, new); + if (new > READ_ONCE(c->local_watermark)) { + WRITE_ONCE(c->local_watermark, new); + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); + } } } @@ -99,6 +120,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail) { struct page_counter *c; + bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; @@ -128,13 +150,15 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_protected_usage(c, new); - /* - * Just like with failcnt, we can live with some - * inaccuracy in the watermark. - */ - if (new > READ_ONCE(c->watermark)) - WRITE_ONCE(c->watermark, new); + if (protection) + propagate_protected_usage(c, new); + + /* see comment on page_counter_charge */ + if (new > READ_ONCE(c->local_watermark)) { + WRITE_ONCE(c->local_watermark, new); + if (new > READ_ONCE(c->watermark)) + WRITE_ONCE(c->watermark, new); + } } return true; @@ -264,6 +288,7 @@ int page_counter_memparse(const char *buf, const char *max, } +#ifdef CONFIG_MEMCG /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its @@ -435,3 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root, atomic_long_read(&parent->children_low_usage), recursive_protection)); } +#endif /* CONFIG_MEMCG */ diff --git a/mm/page_io.c b/mm/page_io.c index ff8c99ee3af7..78bc88acee79 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -172,6 +172,60 @@ bad_bmap: goto out; } +static bool is_folio_zero_filled(struct folio *folio) +{ + unsigned int pos, last_pos; + unsigned long *data; + unsigned int i; + + last_pos = PAGE_SIZE / sizeof(*data) - 1; + for (i = 0; i < folio_nr_pages(folio); i++) { + data = kmap_local_folio(folio, i * PAGE_SIZE); + /* + * Check last word first, incase the page is zero-filled at + * the start and has non-zero data at the end, which is common + * in real-world workloads. + */ + if (data[last_pos]) { + kunmap_local(data); + return false; + } + for (pos = 0; pos < last_pos; pos++) { + if (data[pos]) { + kunmap_local(data); + return false; + } + } + kunmap_local(data); + } + + return true; +} + +static void swap_zeromap_folio_set(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + set_bit(swp_offset(entry), sis->zeromap); + } +} + +static void swap_zeromap_folio_clear(struct folio *folio) +{ + struct swap_info_struct *sis = swp_swap_info(folio->swap); + swp_entry_t entry; + unsigned int i; + + for (i = 0; i < folio_nr_pages(folio); i++) { + entry = page_swap_entry(folio_page(folio, i)); + clear_bit(swp_offset(entry), sis->zeromap); + } +} + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. @@ -195,6 +249,25 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } + + /* + * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages. + * The bits in zeromap are protected by the locked swapcache folio + * and atomic updates are used to protect against read-modify-write + * corruption due to other zero swap entries seeing concurrent updates. + */ + if (is_folio_zero_filled(folio)) { + swap_zeromap_folio_set(folio); + folio_unlock(folio); + return 0; + } else { + /* + * Clear bits this folio occupies in the zeromap to prevent + * zero data being read in from any previous zero writes that + * occupied the same swap entries. + */ + swap_zeromap_folio_clear(folio); + } if (zswap_store(folio)) { folio_unlock(folio); return 0; @@ -273,9 +346,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) * memory for allocating transmit buffers. * Mark the page dirty and avoid * folio_rotate_reclaimable but rate-limit the - * messages but do not flag PageError like - * the normal direct-to-bio case as it could - * be temporary. + * messages. */ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", ret, swap_dev_pos(page_swap_entry(page))); @@ -429,6 +500,28 @@ static void sio_read_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } +static bool swap_read_folio_zeromap(struct folio *folio) +{ + int nr_pages = folio_nr_pages(folio); + bool is_zeromap; + + /* + * Swapping in a large folio that is partially in the zeromap is not + * currently handled. Return true without marking the folio uptodate so + * that an IO error is emitted (e.g. do_swap_page() will sigbus). + */ + if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages, + &is_zeromap) != nr_pages)) + return true; + + if (!is_zeromap) + return false; + + folio_zero_range(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + return true; +} + static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { struct swap_info_struct *sis = swp_swap_info(folio->swap); @@ -519,9 +612,18 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) } delayacct_swapin_start(); - if (zswap_load(folio)) { + if (swap_read_folio_zeromap(folio)) { + folio_unlock(folio); + goto finish; + } else if (zswap_load(folio)) { folio_unlock(folio); - } else if (data_race(sis->flags & SWP_FS_OPS)) { + goto finish; + } + + /* We have to read from slower devices. Increase zswap protection. */ + zswap_folio_swapin(folio); + + if (data_race(sis->flags & SWP_FS_OPS)) { swap_read_folio_fs(folio, plug); } else if (synchronous) { swap_read_folio_bdev_sync(folio, sis); @@ -529,6 +631,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) swap_read_folio_bdev_async(folio, sis); } +finish: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 042937d5abe4..7e04047977cf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -152,6 +152,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ unsigned long flags; unsigned long check_unmovable_start, check_unmovable_end; + if (PageUnaccepted(page)) + accept_page(page); + spin_lock_irqsave(&zone->lock, flags); /* @@ -367,6 +370,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, VM_BUG_ON(!page); pfn = page_to_pfn(page); + if (PageUnaccepted(page)) { + pfn += MAX_ORDER_NR_PAGES; + continue; + } + if (PageBuddy(page)) { int order = buddy_order(page); @@ -395,30 +403,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, unsigned long head_pfn = page_to_pfn(head); unsigned long nr_pages = compound_nr(head); - if (head_pfn + nr_pages <= boundary_pfn) { - pfn = head_pfn + nr_pages; - continue; - } - -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - if (PageHuge(page)) { - int page_mt = get_pageblock_migratetype(page); - struct compact_control cc = { - .nr_migratepages = 0, - .order = -1, - .zone = page_zone(pfn_to_page(head_pfn)), - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - .no_set_skip_hint = true, - .gfp_mask = gfp_flags, - .alloc_contig = true, - }; - INIT_LIST_HEAD(&cc.migratepages); - - ret = __alloc_contig_migrate_range(&cc, head_pfn, - head_pfn + nr_pages, page_mt); - if (ret) - goto failed; + if (head_pfn + nr_pages <= boundary_pfn || + PageHuge(page)) { pfn = head_pfn + nr_pages; continue; } @@ -432,7 +418,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, */ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page); VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page); -#endif + goto failed; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ae2f08ce991b..461ea3bbd8d9 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,6 +3,8 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/swapops.h> /* * We want to know the real level where a entry is located ignoring any @@ -654,3 +656,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, return err; } + +/** + * folio_walk_start - walk the page tables to a folio + * @fw: filled with information on success. + * @vma: the VMA. + * @addr: the virtual address to use for the page table walk. + * @flags: flags modifying which folios to walk to. + * + * Walk the page tables using @addr in a given @vma to a mapped folio and + * return the folio, making sure that the page table entry referenced by + * @addr cannot change until folio_walk_end() was called. + * + * As default, this function returns only folios that are not special (e.g., not + * the zeropage) and never returns folios that are supposed to be ignored by the + * VM as documented by vm_normal_page(). If requested, zeropages will be + * returned as well. + * + * As default, this function only considers present page table entries. + * If requested, it will also consider migration entries. + * + * If this function returns NULL it might either indicate "there is nothing" or + * "there is nothing suitable". + * + * On success, @fw is filled and the function returns the folio while the PTL + * is still held and folio_walk_end() must be called to clean up, + * releasing any held locks. The returned folio must *not* be used after the + * call to folio_walk_end(), unless a short-term folio reference is taken before + * that call. + * + * @fw->page will correspond to the page that is effectively referenced by + * @addr. However, for migration entries and shared zeropages @fw->page is + * set to NULL. Note that large folios might be mapped by multiple page table + * entries, and this function will always only lookup a single entry as + * specified by @addr, which might or might not cover more than a single page of + * the returned folio. + * + * This function must *not* be used as a naive replacement for + * get_user_pages() / pin_user_pages(), especially not to perform DMA or + * to carelessly modify page content. This function may *only* be used to grab + * short-term folio references, never to grab long-term folio references. + * + * Using the page table entry pointers in @fw for reading or modifying the + * entry should be avoided where possible: however, there might be valid + * use cases. + * + * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. + * For example, PMD page table sharing might require prior unsharing. Also, + * logical hugetlb entries might span multiple physical page table entries, + * which *must* be modified in a single operation (set_huge_pte_at(), + * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might + * not correspond to the first physical entry of a logical hugetlb entry. + * + * The mmap lock must be held in read mode. + * + * Return: folio pointer on success, otherwise NULL. + */ +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags) +{ + unsigned long entry_size; + bool expose_page = true; + struct page *page; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + pgd_t *pgdp; + p4d_t *p4dp; + + mmap_assert_locked(vma->vm_mm); + vma_pgtable_walk_begin(vma); + + if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) + goto not_found; + + pgdp = pgd_offset(vma->vm_mm, addr); + if (pgd_none_or_clear_bad(pgdp)) + goto not_found; + + p4dp = p4d_offset(pgdp, addr); + if (p4d_none_or_clear_bad(p4dp)) + goto not_found; + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + if (pud_none(pud)) + goto not_found; + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { + ptl = pud_lock(vma->vm_mm, pudp); + pud = pudp_get(pudp); + + entry_size = PUD_SIZE; + fw->level = FW_LEVEL_PUD; + fw->pudp = pudp; + fw->pud = pud; + + if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { + spin_unlock(ptl); + goto not_found; + } else if (!pud_leaf(pud)) { + spin_unlock(ptl); + goto pmd_table; + } + /* + * TODO: vm_normal_page_pud() will be handy once we want to + * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + */ + page = pud_page(pud); + goto found; + } + +pmd_table: + VM_WARN_ON_ONCE(pud_leaf(*pudp)); + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd)) + goto not_found; + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { + ptl = pmd_lock(vma->vm_mm, pmdp); + pmd = pmdp_get(pmdp); + + entry_size = PMD_SIZE; + fw->level = FW_LEVEL_PMD; + fw->pmdp = pmdp; + fw->pmd = pmd; + + if (pmd_none(pmd)) { + spin_unlock(ptl); + goto not_found; + } else if (!pmd_leaf(pmd)) { + spin_unlock(ptl); + goto pte_table; + } else if (pmd_present(pmd)) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page) { + goto found; + } else if ((flags & FW_ZEROPAGE) && + is_huge_zero_pmd(pmd)) { + page = pfn_to_page(pmd_pfn(pmd)); + expose_page = false; + goto found; + } + } else if ((flags & FW_MIGRATION) && + is_pmd_migration_entry(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + page = pfn_swap_entry_to_page(entry); + expose_page = false; + goto found; + } + spin_unlock(ptl); + goto not_found; + } + +pte_table: + VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); + ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); + if (!ptep) + goto not_found; + pte = ptep_get(ptep); + + entry_size = PAGE_SIZE; + fw->level = FW_LEVEL_PTE; + fw->ptep = ptep; + fw->pte = pte; + + if (pte_present(pte)) { + page = vm_normal_page(vma, addr, pte); + if (page) + goto found; + if ((flags & FW_ZEROPAGE) && + is_zero_pfn(pte_pfn(pte))) { + page = pfn_to_page(pte_pfn(pte)); + expose_page = false; + goto found; + } + } else if (!pte_none(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if ((flags & FW_MIGRATION) && + is_migration_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + expose_page = false; + goto found; + } + } + pte_unmap_unlock(ptep, ptl); +not_found: + vma_pgtable_walk_end(vma); + return NULL; +found: + if (expose_page) + /* Note: Offset from the mapped page, not the folio start. */ + fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); + else + fw->page = NULL; + fw->ptl = ptl; + return page_folio(page); +} diff --git a/mm/percpu.c b/mm/percpu.c index 20d91af8c033..da21680ff294 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2217,37 +2217,6 @@ static void pcpu_balance_workfn(struct work_struct *work) } /** - * pcpu_alloc_size - the size of the dynamic percpu area - * @ptr: pointer to the dynamic percpu area - * - * Returns the size of the @ptr allocation. This is undefined for statically - * defined percpu variables as there is no corresponding chunk->bound_map. - * - * RETURNS: - * The size of the dynamic percpu area. - * - * CONTEXT: - * Can be called from atomic context. - */ -size_t pcpu_alloc_size(void __percpu *ptr) -{ - struct pcpu_chunk *chunk; - unsigned long bit_off, end; - void *addr; - - if (!ptr) - return 0; - - addr = __pcpu_ptr_to_addr(ptr); - /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */ - chunk = pcpu_chunk_addr_search(addr); - bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE; - end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), - bit_off + 1); - return (end - bit_off) * PCPU_MIN_ALLOC_SIZE; -} - -/** * free_percpu - free percpu area * @ptr: pointer to area to free * diff --git a/mm/rmap.c b/mm/rmap.c index 2490e727e2dc..a8797d1b3d49 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -75,6 +75,7 @@ #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> +#include <linux/oom.h> #include <asm/tlbflush.h> @@ -870,6 +871,20 @@ static bool folio_referenced_one(struct folio *folio, continue; } + /* + * Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. This avoids redundant + * swap-out followed by an immediate unmap. + */ + if ((!atomic_read(&vma->vm_mm->mm_users) || + check_stable_address_space(vma->vm_mm)) && + folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_likely_mapped_shared(folio)) { + pra->referenced = -1; + page_vma_mapped_walk_done(&pvmw); + return false; + } + if (pvmw.pte) { if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte))) { @@ -1143,25 +1158,25 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; - int first, nr = 0; + int first = 0, nr = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - nr = atomic_inc_and_test(&page->_mapcount); + nr = atomic_inc_and_test(&folio->_mapcount); break; } do { - first = atomic_inc_and_test(&page->_mapcount); - if (first) { - first = atomic_inc_return_relaxed(mapped); - if (first < ENTIRELY_MAPPED) - nr++; - } + first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); + + if (first && + atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) + nr = first; + atomic_add(orig_nr_pages, &folio->_large_mapcount); break; case RMAP_LEVEL_PMD: @@ -1452,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __folio_mod_stat(folio, nr, nr_pmdmapped); + mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, @@ -1512,7 +1528,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; - int last, nr = 0, nr_pmdmapped = 0; + int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1520,20 +1536,19 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - nr = atomic_add_negative(-1, &page->_mapcount); + nr = atomic_add_negative(-1, &folio->_mapcount); break; } atomic_sub(nr_pages, &folio->_large_mapcount); do { - last = atomic_add_negative(-1, &page->_mapcount); - if (last) { - last = atomic_dec_return_relaxed(mapped); - if (last < ENTIRELY_MAPPED) - nr++; - } + last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); + if (last && + atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) + nr = last; + partially_mapped = nr && atomic_read(mapped); break; case RMAP_LEVEL_PMD: @@ -1553,22 +1568,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, } } - partially_mapped = nr < nr_pmdmapped; + partially_mapped = nr && nr < nr_pmdmapped; break; } - if (nr) { - /* - * Queue anon large folio for deferred split if at least one - * page of the folio is unmapped and at least one page - * is still mapped. - * - * Check partially_mapped first to ensure it is a large folio. - */ - if (folio_test_anon(folio) && partially_mapped && - list_empty(&folio->_deferred_list)) - deferred_split_folio(folio); - } + /* + * Queue anon large folio for deferred split if at least one page of + * the folio is unmapped and at least one page is still mapped. + * + * Check partially_mapped first to ensure it is a large folio. + */ + if (partially_mapped && folio_test_anon(folio) && + !folio_test_partially_mapped(folio)) + deferred_split_folio(folio, true); + __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* diff --git a/mm/shmem.c b/mm/shmem.c index b875852df51f..6eff8771d9cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -155,7 +155,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, vm_fault_t *fault_type); + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -502,8 +502,8 @@ static int shmem_replace_entry(struct address_space *mapping, * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back from swap by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. */ static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) @@ -548,10 +548,12 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool __shmem_is_huge(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) +static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { + struct mm_struct *mm = vma ? vma->vm_mm : NULL; loff_t i_size; if (!S_ISREG(inode->i_mode)) @@ -568,7 +570,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index, return true; case SHMEM_HUGE_WITHIN_SIZE: index = round_up(index + 1, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= index) return true; fallthrough; @@ -581,14 +584,15 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index, } } -bool shmem_is_huge(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, unsigned long vm_flags) { if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; - return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags); + return __shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); } #if defined(CONFIG_SYSFS) @@ -634,15 +638,14 @@ static const char *shmem_format_huge(int huge) #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; - LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; - int split = 0; + unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -660,13 +663,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto next; } - /* Check if there's anything to gain */ - if (round_up(inode->i_size, PAGE_SIZE) == - round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_move(&info->shrinklist, &to_remove); - goto next; - } - list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; @@ -675,34 +671,36 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); - list_for_each_safe(pos, next, &to_remove) { - info = list_entry(pos, struct shmem_inode_info, shrinklist); - inode = &info->vfs_inode; - list_del_init(&info->shrinklist); - iput(inode); - } - list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; int ret; - pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) + if (nr_to_free && freed >= nr_to_free) goto move_back; - index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; - folio = filemap_get_folio(inode->i_mapping, index); - if (IS_ERR(folio)) + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) goto drop; - /* No huge page at the end of the file: nothing to split */ + /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. @@ -723,6 +721,7 @@ next: if (ret) goto move_back; + freed += next - end; split++; drop: list_del_init(&info->shrinklist); @@ -767,10 +766,17 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { return 0; } + +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, unsigned long vm_flags) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* @@ -786,7 +792,6 @@ static int shmem_add_to_page_cache(struct folio *folio, VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; @@ -842,23 +847,27 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). */ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -881,7 +890,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) - swapped++; + swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { @@ -971,7 +980,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * (although in some cases this is just a waste of time). */ folio = NULL; - shmem_get_folio(inode, index, &folio, SGP_READ); + shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } @@ -1010,7 +1019,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1077,14 +1086,17 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1156,7 +1168,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(inode, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1443,6 +1455,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); swp_entry_t swap; pgoff_t index; + int nr_pages; + bool split = false; /* * Our capabilities prevent regular writeback or sync from ever calling @@ -1461,20 +1475,33 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_huge_page_to_list_to_order(page, wbc->list, 0)) goto redirty; folio = page_folio(page); folio_clear_dirty(folio); } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1509,8 +1536,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } swap = folio_alloc_swap(folio); - if (!swap.val) + if (!swap.val) { + if (nr_pages > 1) + goto try_split; + goto redirty; + } /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1527,8 +1558,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(swap); + shmem_recalc_inode(inode, 0, nr_pages); + swap_shmem_alloc(swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); @@ -1624,22 +1655,33 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma->vm_flags; + unsigned long vm_flags = vma ? vma->vm_flags : 0; + bool global_huge; loff_t i_size; int order; - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (vma && ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return 0; /* If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) return 0; + global_huge = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + if (!vma || !vma_is_anon_shmem(vma)) { + /* + * For tmpfs, we now only support PMD sized THP if huge page + * is enabled, otherwise fallback to order 0. + */ + return global_huge ? BIT(HPAGE_PMD_ORDER) : 0; + } + /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. @@ -1680,20 +1722,30 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault struct address_space *mapping, pgoff_t index, unsigned long orders) { - struct vm_area_struct *vma = vmf->vma; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; - orders = thp_vma_suitable_orders(vma, vmf->address, orders); - if (!orders) - return 0; + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; @@ -1731,7 +1783,6 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct vm_area_struct *vma = vmf ? vmf->vma : NULL; unsigned long suitable_orders = 0; struct folio *folio = NULL; long pages; @@ -1741,26 +1792,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, orders = 0; if (orders > 0) { - if (vma && vma_is_anon_shmem(vma)) { - suitable_orders = shmem_suitable_orders(inode, vmf, + suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); - } else if (orders & BIT(HPAGE_PMD_ORDER)) { - pages = HPAGE_PMD_NR; - suitable_orders = BIT(HPAGE_PMD_ORDER); - index = round_down(index, HPAGE_PMD_NR); - - /* - * Check for conflict before waiting on a huge allocation. - * Conflict might be that a huge page has just been allocated - * and added to page cache by a racing thread, or that there - * is already at least one small page in the huge extent. - * Be careful to retry when appropriate, but not forever! - * Elsewhere -EEXIST would be the right code, but not here. - */ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); - } order = highest_order(suitable_orders); while (suitable_orders) { @@ -1772,9 +1805,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); -#endif order = next_order(&suitable_orders, order); } } else { @@ -1799,10 +1830,8 @@ allocated: count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); -#endif } goto unlock; } @@ -1819,7 +1848,7 @@ allocated: * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ - shmem_unused_huge_shrink(sbinfo, NULL, 2); + shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. @@ -1867,30 +1896,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swap_cache_index(entry); - swap_mapping = swap_address_space(entry); + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + struct address_space *swap_mapping = swap_address_space(entry); + pgoff_t swap_index = swap_cache_index(entry); + XA_STATE(xas, &swap_mapping->i_pages, swap_index); + int nr_pages = folio_nr_pages(old); + int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, 0, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1900,26 +1934,34 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ + /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); + for (i = 0; i < nr_pages; i++) { + void *item = xas_load(&xas); + + if (item != old) { + error = -ENOENT; + break; + } + + xas_store(&xas, new); + xas_next(&xas); + } if (!error) { mem_cgroup_replace_folio(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages); + __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. + * Is this possible? I think not, now that our callers + * check both the swapcache flag and folio->private + * after getting the folio lock; but be defensive. + * Reverse old to newpage for clear and free. */ old = new; } else { @@ -1931,7 +1973,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } @@ -1941,6 +1988,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1949,6 +1997,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); delete_from_swap_cache(folio); /* @@ -1956,8 +2005,86 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + void *alloced_shadow = NULL; + int alloced_order = 0, i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + int order = -1, split_order = 0; + void *old = NULL; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + order = xas_get_order(&xas); + + /* Swap entry may have changed before we re-acquire the lock */ + if (alloced_order && + (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; + } + + /* Try to split large swap entry in pagecache */ + if (order > 0) { + if (!alloced_order) { + split_order = order; + goto unlock; + } + xas_split(&xas, old, order); + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << order; i++) { + pgoff_t aligned_index = round_down(index, 1 << order); + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), swp_offset(swap) + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + } + +unlock: + xas_unlock_irq(&xas); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } + +error: + if (xas_error(&xas)) + return xas_error(&xas); + + return alloced_order; } /* @@ -1968,15 +2095,16 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, + gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; - int error; + int error, nr_pages; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -1996,12 +2124,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); if (!folio) { + int split_order; + /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + + /* + * Now swap device can only swap in order 0 folio, then we + * should split the large swap entry stored in the pagecache + * if necessary. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Here we actually start the io */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { @@ -2023,6 +2176,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); + nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the @@ -2031,24 +2185,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } - error = shmem_add_to_page_cache(folio, mapping, index, + error = shmem_add_to_page_cache(folio, mapping, + round_down(index, nr_pages), swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); delete_from_swap_cache(folio); folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -2078,14 +2233,14 @@ unlock: * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_fault *vmf, vm_fault_t *fault_type) + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; - bool alloced, huge; + bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) @@ -2111,7 +2266,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, fault_mm, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; @@ -2158,14 +2313,8 @@ repeat: return 0; } - huge = shmem_is_huge(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0); - /* Find hugepage orders that are allowed for anonymous shmem. */ - if (vma && vma_is_anon_shmem(vma)) - orders = shmem_allowable_huge_orders(inode, vma, index, huge); - else if (huge) - orders = BIT(HPAGE_PMD_ORDER); - + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; @@ -2176,9 +2325,7 @@ repeat: if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); -#endif goto alloced; } if (PTR_ERR(folio) == -EEXIST) @@ -2198,7 +2345,7 @@ alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - folio_next_index(folio) - 1) { + folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* @@ -2268,6 +2415,7 @@ unlock: * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. + * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * @@ -2287,10 +2435,10 @@ unlock: * Context: May sleep. * Return: 0 if successful, else a negative error code. */ -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp) +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) { - return shmem_get_folio_gfp(inode, index, foliop, sgp, + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); @@ -2385,7 +2533,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } WARN_ON_ONCE(vmf->page != NULL); - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); @@ -2895,7 +3043,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; @@ -2965,7 +3113,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_get_folio(inode, index, &folio, SGP_READ); + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -3141,7 +3289,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) @@ -3331,8 +3479,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_get_folio(inode, index, &folio, - SGP_FALLOC); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ @@ -3683,7 +3831,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; @@ -3729,7 +3877,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) @@ -5197,7 +5345,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c index ce514e700d2f..d1e32ac01407 100644 --- a/mm/shmem_quota.c +++ b/mm/shmem_quota.c @@ -34,8 +34,6 @@ #include <linux/quotaops.h> #include <linux/quota.h> -#ifdef CONFIG_TMPFS_QUOTA - /* * The following constants define the amount of time given a user * before the soft limits are treated as hard limits (usually resulting @@ -351,4 +349,3 @@ const struct dquot_operations shmem_quota_operations = { .mark_dirty = shmem_mark_dquot_dirty, .get_next_id = shmem_get_next_id, }; -#endif /* CONFIG_TMPFS_QUOTA */ diff --git a/mm/show_mem.c b/mm/show_mem.c index bdb439551eef..ec885a398fa0 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -435,15 +435,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct); struct alloc_tag_counters counter = alloc_tag_read(tag); + char bytes[10]; + + string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes)); /* Same as alloc_tag_to_text() but w/o intermediate buffer */ if (ct->modname) - pr_notice("%12lli %8llu %s:%u [%s] func:%s\n", - counter.bytes, counter.calls, ct->filename, + pr_notice("%12s %8llu %s:%u [%s] func:%s\n", + bytes, counter.calls, ct->filename, ct->lineno, ct->modname, ct->function); else - pr_notice("%12lli %8llu %s:%u func:%s\n", - counter.bytes, counter.calls, ct->filename, + pr_notice("%12s %8llu %s:%u func:%s\n", + bytes, counter.calls, ct->filename, ct->lineno, ct->function); } } diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 12ea5486a3e9..4a85b94d12ce 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -114,7 +114,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, int nid; char kbuf[72]; - read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); + read_len = min(size, sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) return -EFAULT; kbuf[read_len] = '\0'; diff --git a/mm/slab_common.c b/mm/slab_common.c index 61f32420230a..744324465615 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1205,6 +1205,13 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) /* If the object still fits, repoison it precisely. */ if (ks >= new_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + memset((void *)p + new_size, 0, ks - new_size); + kasan_enable_current(); + } + p = kasan_krealloc((void *)p, new_size, flags); return (void *)p; } @@ -1226,11 +1233,27 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * @new_size: how many bytes of memory are required. * @flags: the type of memory to allocate. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored). * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * This is the case, since krealloc() only knows about the bucket size of an + * allocation (but not the exact size it was allocated with) and hence + * implements the following semantics for shrinking and growing buffers with + * __GFP_ZERO. + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * * Return: pointer to the allocated memory or %NULL in case of error */ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) diff --git a/mm/swap.c b/mm/swap.c index 9caf6b017cf0..835bdf324b76 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,31 +47,27 @@ int page_cluster; const int page_cluster_max = 31; -/* Protecting only lru_rotate.fbatch which requires disabling interrupts */ -struct lru_rotate { - local_lock_t lock; - struct folio_batch fbatch; -}; -static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { - .lock = INIT_LOCAL_LOCK(lock), -}; - -/* - * The following folio batches are grouped together because they are protected - * by disabling preemption (and interrupts remain enabled). - */ struct cpu_fbatches { + /* + * The following folio batches are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). + */ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP - struct folio_batch activate; + struct folio_batch lru_activate; #endif + /* Protecting the following batches which require disabling interrupts */ + local_lock_t lock_irq; + struct folio_batch lru_move_tail; }; + static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), + .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, @@ -117,7 +113,9 @@ void __folio_put(struct folio *folio) if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; - } else if (folio_test_hugetlb(folio)) { + } + + if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } @@ -162,7 +160,7 @@ EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); -static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); @@ -222,23 +220,50 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) folios_put(fbatch); } -static void folio_batch_add_and_move(struct folio_batch *fbatch, - struct folio *folio, move_fn_t move_fn) +static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, + struct folio *folio, move_fn_t move_fn, + bool on_lru, bool disable_irq) { - if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && - !lru_cache_disabled()) + unsigned long flags; + + if (on_lru && !folio_test_clear_lru(folio)) return; - folio_batch_move_lru(fbatch, move_fn); + + folio_get(folio); + + if (disable_irq) + local_lock_irqsave(&cpu_fbatches.lock_irq, flags); + else + local_lock(&cpu_fbatches.lock); + + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || + lru_cache_disabled()) + folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); + + if (disable_irq) + local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); + else + local_unlock(&cpu_fbatches.lock); } -static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) +#define folio_batch_add_and_move(folio, op, on_lru) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + on_lru, \ + offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ + ) + +static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_unevictable(folio)) { - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - lruvec_add_folio_tail(lruvec, folio); - __count_vm_events(PGROTATED, folio_nr_pages(folio)); - } + if (folio_test_unevictable(folio)) + return; + + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* @@ -250,22 +275,11 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) */ void folio_rotate_reclaimable(struct folio *folio) { - if (!folio_test_locked(folio) && !folio_test_dirty(folio) && - !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; - unsigned long flags; - - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } + if (folio_test_locked(folio) || folio_test_dirty(folio) || + folio_test_unevictable(folio)) + return; - local_lock_irqsave(&lru_rotate.lock, flags); - fbatch = this_cpu_ptr(&lru_rotate.fbatch); - folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); - local_unlock_irqrestore(&lru_rotate.lock, flags); - } + folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost(struct lruvec *lruvec, bool file, @@ -326,47 +340,38 @@ void lru_note_cost_refault(struct folio *folio) folio_nr_pages(folio), 0); } -static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_activate(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_set_active(folio); - lruvec_add_folio(lruvec, folio); - trace_mm_lru_activate(folio); + if (folio_test_active(folio) || folio_test_unevictable(folio)) + return; - __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, - nr_pages); - } + + lruvec_del_folio(lruvec, folio); + folio_set_active(folio); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_activate(folio); + + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { - struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, folio_activate_fn); + folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { - if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; - - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } + if (folio_test_active(folio) || folio_test_unevictable(folio)) + return; - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.activate); - folio_batch_add_and_move(fbatch, folio, folio_activate_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_activate, true); } #else @@ -378,12 +383,13 @@ void folio_activate(struct folio *folio) { struct lruvec *lruvec; - if (folio_test_clear_lru(folio)) { - lruvec = folio_lruvec_lock_irq(folio); - folio_activate_fn(lruvec, folio); - unlock_page_lruvec_irq(lruvec); - folio_set_lru(folio); - } + if (!folio_test_clear_lru(folio)) + return; + + lruvec = folio_lruvec_lock_irq(folio); + lru_activate(lruvec, folio); + unlock_page_lruvec_irq(lruvec); + folio_set_lru(folio); } #endif @@ -482,7 +488,7 @@ void folio_mark_accessed(struct folio *folio) } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via - * cpu_fbatches.activate. Otherwise, assume the folio is in a + * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ @@ -509,8 +515,6 @@ EXPORT_SYMBOL(folio_mark_accessed); */ void folio_add_lru(struct folio *folio) { - struct folio_batch *fbatch; - VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); @@ -520,11 +524,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_get(folio); - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); - folio_batch_add_and_move(fbatch, folio, lru_add_fn); - local_unlock(&cpu_fbatches.lock); + folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); @@ -567,7 +567,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ -static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); @@ -608,43 +608,43 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) } } -static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { - if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - folio_clear_referenced(folio); - lruvec_add_folio(lruvec, folio); + if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + return; - __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_pages); - } + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } -static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) +static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { - if (folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { - long nr_pages = folio_nr_pages(folio); + long nr_pages = folio_nr_pages(folio); - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - folio_clear_referenced(folio); - /* - * Lazyfree folios are clean anonymous folios. They have - * the swapbacked flag cleared, to distinguish them from normal - * anonymous folios - */ - folio_clear_swapbacked(folio); - lruvec_add_folio(lruvec, folio); + if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + folio_test_swapcache(folio) || folio_test_unevictable(folio)) + return; - __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, - nr_pages); - } + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + /* + * Lazyfree folios are clean anonymous folios. They have + * the swapbacked flag cleared, to distinguish them from normal + * anonymous folios + */ + folio_clear_swapbacked(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* @@ -658,30 +658,30 @@ void lru_add_drain_cpu(int cpu) struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_add_fn); + folio_batch_move_lru(fbatch, lru_add); - fbatch = &per_cpu(lru_rotate.fbatch, cpu); + fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_lock_irqsave(&lru_rotate.lock, flags); - folio_batch_move_lru(fbatch, lru_move_tail_fn); - local_unlock_irqrestore(&lru_rotate.lock, flags); + local_lock_irqsave(&cpu_fbatches.lock_irq, flags); + folio_batch_move_lru(fbatch, lru_move_tail); + local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_deactivate_file_fn); + folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_deactivate_fn); + folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) - folio_batch_move_lru(fbatch, lru_lazyfree_fn); + folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } @@ -698,22 +698,11 @@ void lru_add_drain_cpu(int cpu) */ void deactivate_file_folio(struct folio *folio) { - struct folio_batch *fbatch; - /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } - - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); - folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); - local_unlock(&cpu_fbatches.lock); + folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* @@ -726,21 +715,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (!folio_test_unevictable(folio) && (folio_test_active(folio) || - lru_gen_enabled())) { - struct folio_batch *fbatch; - - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } + if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + return; - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); - folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_deactivate, true); } /** @@ -752,21 +730,11 @@ void folio_deactivate(struct folio *folio) */ void folio_mark_lazyfree(struct folio *folio) { - if (folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { - struct folio_batch *fbatch; - - folio_get(folio); - if (!folio_test_clear_lru(folio)) { - folio_put(folio); - return; - } + if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + folio_test_swapcache(folio) || folio_test_unevictable(folio)) + return; - local_lock(&cpu_fbatches.lock); - fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); - folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); - local_unlock(&cpu_fbatches.lock); - } + folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) @@ -816,11 +784,11 @@ static bool cpu_needs_drain(unsigned int cpu) /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || - data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || + folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || - folio_batch_count(&fbatches->activate) || + folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } @@ -938,8 +906,8 @@ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling - * a list of pages to be migrated using isolate_lru_page(). - * It drains pages on LRU cache and then disable on all cpus until + * a list of folios to be migrated using folio_isolate_lru(). + * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). diff --git a/mm/swap.h b/mm/swap.h index baa1fa946b34..ad2f121de970 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -59,7 +59,7 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); struct folio *filemap_get_incore_folio(struct address_space *mapping, @@ -73,13 +73,39 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags, bool skip_if_exists); struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); -struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); +struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); static inline unsigned int folio_swap_flags(struct folio *folio) { return swp_swap_info(folio->swap)->flags; } + +/* + * Return the count of contiguous swap entries that share the same + * zeromap status as the starting entry. If is_zeromap is not NULL, + * it will return the zeromap status of the starting entry. + */ +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *is_zeromap) +{ + struct swap_info_struct *sis = swp_swap_info(entry); + unsigned long start = swp_offset(entry); + unsigned long end = start + max_nr; + bool first_bit; + + first_bit = test_bit(start, sis->zeromap); + if (is_zeromap) + *is_zeromap = first_bit; + + if (max_nr <= 1) + return max_nr; + if (first_bit) + return find_next_zero_bit(sis->zeromap, end, start) - start; + else + return find_next_bit(sis->zeromap, end, start) - start; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -109,7 +135,7 @@ static inline struct folio *swap_cluster_readahead(swp_entry_t entry, return NULL; } -static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, +static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, struct vm_fault *vmf) { return NULL; @@ -120,7 +146,7 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } -static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { } @@ -171,5 +197,13 @@ static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } + +static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, + bool *has_zeromap) +{ + return 0; +} + #endif /* CONFIG_SWAP */ + #endif /* _MM_SWAP_H */ diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index db6c4a26cf59..da1278f0563b 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { + if (mem_cgroup_disabled()) + return 0; return lookup_swap_cgroup(ent, NULL)->id; } diff --git a/mm/swap_state.c b/mm/swap_state.c index a1726e49a5eb..4669f29cf555 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -435,6 +435,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, { struct swap_info_struct *si; struct folio *folio; + struct folio *new_folio = NULL; + struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; @@ -463,27 +465,28 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) - goto fail_put_swap; + goto put_and_return; /* - * Get a new folio to read into from swap. Allocate it now, - * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will - * cause any racers to loop around until we add it to cache. + * Get a new folio to read into from swap. Allocate it now if + * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, + * when -EEXIST will cause any racers to loop around until we + * add it to cache. */ - folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); - if (!folio) - goto fail_put_swap; + if (!new_folio) { + new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!new_folio) + goto put_and_return; + } /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry); + err = swapcache_prepare(entry, 1); if (!err) break; - - folio_put(folio); - if (err != -EEXIST) - goto fail_put_swap; + else if (err != -EEXIST) + goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() @@ -494,7 +497,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) - goto fail_put_swap; + goto put_and_return; /* * We might race against __delete_from_swap_cache(), and @@ -509,36 +512,37 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * The swap entry is ours to swap in. Prepare the new folio. */ + __folio_set_locked(new_folio); + __folio_set_swapbacked(new_folio); - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - mem_cgroup_swapin_uncharge_swap(entry); + mem_cgroup_swapin_uncharge_swap(entry, 1); if (shadow) - workingset_refault(folio, shadow); + workingset_refault(new_folio, shadow); - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); + /* Caller will initiate read into locked new_folio */ + folio_add_lru(new_folio); *new_page_allocated = true; + folio = new_folio; got_folio: - put_swap_device(si); - return folio; + result = folio; + goto put_and_return; fail_unlock: - put_swap_folio(folio, entry); - folio_unlock(folio); - folio_put(folio); -fail_put_swap: + put_swap_folio(new_folio, entry); + folio_unlock(new_folio); +put_and_return: put_swap_device(si); - return NULL; + if (!(*new_page_allocated) && new_folio) + folio_put(new_folio); + return result; } /* @@ -698,10 +702,8 @@ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } @@ -850,10 +852,8 @@ skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); + if (unlikely(page_allocated)) swap_read_folio(folio, NULL); - } return folio; } @@ -863,13 +863,13 @@ skip: * @gfp_mask: memory allocation flags * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; @@ -882,9 +882,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); + return folio; } #ifdef CONFIG_SYSFS diff --git a/mm/swapfile.c b/mm/swapfile.c index 38bdc439651a..0cded32414a1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,6 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages); +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries); +static bool folio_swapcache_freeable(struct folio *folio); +static struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, unsigned long offset); +static void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -127,8 +136,44 @@ static inline unsigned char swap_count(unsigned char ent) * corresponding page */ #define TTRS_UNMAPPED 0x2 -/* Reclaim the swap entry if swap is getting full*/ +/* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 +/* Reclaim directly, bypass the slot cache and don't touch device lock */ +#define TTRS_DIRECT 0x8 + +static bool swap_is_has_cache(struct swap_info_struct *si, + unsigned long offset, int nr_pages) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + + do { + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); + if (*map != SWAP_HAS_CACHE) + return false; + } while (++map < map_end); + + return true; +} + +static bool swap_is_last_map(struct swap_info_struct *si, + unsigned long offset, int nr_pages, bool *has_cache) +{ + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + unsigned char count = *map; + + if (swap_count(count) != 1) + return false; + + while (++map < map_end) { + if (*map != count) + return false; + } + + *has_cache = !!(count & SWAP_HAS_CACHE); + return true; +} /* * returns number of pages in the folio that backs the swap entry. If positive, @@ -139,12 +184,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); + struct address_space *address_space = swap_address_space(entry); + struct swap_cluster_info *ci; struct folio *folio; - int ret = 0; + int ret, nr_pages; + bool need_reclaim; - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); + folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; + + /* offset could point to the middle of a large folio */ + entry = folio->swap; + offset = swp_offset(entry); + nr_pages = folio_nr_pages(folio); + ret = -nr_pages; + /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock @@ -152,14 +207,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (folio_trylock(folio)) { - if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) - ret = folio_free_swap(folio); - folio_unlock(folio); + if (!folio_trylock(folio)) + goto out; + + need_reclaim = ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); + if (!need_reclaim || !folio_swapcache_freeable(folio)) + goto out_unlock; + + /* + * It's safe to delete the folio from swap cache only if the folio's + * swap_map is HAS_CACHE only, which means the slots have no page table + * reference or pending writeback, and can't be allocated to others. + */ + ci = lock_cluster_or_swap_info(si, offset); + need_reclaim = swap_is_has_cache(si, offset, nr_pages); + unlock_cluster_or_swap_info(si, ci); + if (!need_reclaim) + goto out_unlock; + + if (!(flags & TTRS_DIRECT)) { + /* Free through slot cache */ + delete_from_swap_cache(folio); + folio_set_dirty(folio); + ret = nr_pages; + goto out_unlock; } - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); + + xa_lock_irq(&address_space->i_pages); + __delete_from_swap_cache(folio, entry, NULL); + xa_unlock_irq(&address_space->i_pages); + folio_ref_sub(folio, nr_pages); + folio_set_dirty(folio); + + spin_lock(&si->lock); + /* Only sinple page folio can be backed by zswap */ + if (nr_pages == 1) + zswap_invalidate(entry); + swap_entry_range_free(si, entry, nr_pages); + spin_unlock(&si->lock); + ret = nr_pages; +out_unlock: + folio_unlock(folio); +out: folio_put(folio); return ret; } @@ -290,62 +381,21 @@ static void discard_swap_cluster(struct swap_info_struct *si, #endif #define LATENCY_LIMIT 256 -static inline void cluster_set_flag(struct swap_cluster_info *info, - unsigned int flag) -{ - info->flags = flag; -} - -static inline unsigned int cluster_count(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_count(struct swap_cluster_info *info, - unsigned int c) -{ - info->data = c; -} - -static inline void cluster_set_count_flag(struct swap_cluster_info *info, - unsigned int c, unsigned int f) -{ - info->flags = f; - info->data = c; -} - -static inline unsigned int cluster_next(struct swap_cluster_info *info) -{ - return info->data; -} - -static inline void cluster_set_next(struct swap_cluster_info *info, - unsigned int n) -{ - info->data = n; -} - -static inline void cluster_set_next_flag(struct swap_cluster_info *info, - unsigned int n, unsigned int f) -{ - info->flags = f; - info->data = n; -} - static inline bool cluster_is_free(struct swap_cluster_info *info) { return info->flags & CLUSTER_FLAG_FREE; } -static inline bool cluster_is_null(struct swap_cluster_info *info) +static inline unsigned int cluster_index(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - return info->flags & CLUSTER_FLAG_NEXT_NULL; + return ci - si->cluster_info; } -static inline void cluster_set_null(struct swap_cluster_info *info) +static inline unsigned int cluster_offset(struct swap_info_struct *si, + struct swap_cluster_info *ci) { - info->flags = CLUSTER_FLAG_NEXT_NULL; - info->data = 0; + return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, @@ -394,65 +444,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, spin_unlock(&si->lock); } -static inline bool cluster_list_empty(struct swap_cluster_list *list) -{ - return cluster_is_null(&list->head); -} - -static inline unsigned int cluster_list_first(struct swap_cluster_list *list) -{ - return cluster_next(&list->head); -} - -static void cluster_list_init(struct swap_cluster_list *list) -{ - cluster_set_null(&list->head); - cluster_set_null(&list->tail); -} - -static void cluster_list_add_tail(struct swap_cluster_list *list, - struct swap_cluster_info *ci, - unsigned int idx) -{ - if (cluster_list_empty(list)) { - cluster_set_next_flag(&list->head, idx, 0); - cluster_set_next_flag(&list->tail, idx, 0); - } else { - struct swap_cluster_info *ci_tail; - unsigned int tail = cluster_next(&list->tail); - - /* - * Nested cluster lock, but both cluster locks are - * only acquired when we held swap_info_struct->lock - */ - ci_tail = ci + tail; - spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); - cluster_set_next(ci_tail, idx); - spin_unlock(&ci_tail->lock); - cluster_set_next_flag(&list->tail, idx, 0); - } -} - -static unsigned int cluster_list_del_first(struct swap_cluster_list *list, - struct swap_cluster_info *ci) -{ - unsigned int idx; - - idx = cluster_next(&list->head); - if (cluster_next(&list->tail) == idx) { - cluster_set_null(&list->head); - cluster_set_null(&list->tail); - } else - cluster_set_next_flag(&list->head, - cluster_next(&ci[idx]), 0); - - return idx; -} - /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, - unsigned int idx) + struct swap_cluster_info *ci) { + unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't @@ -462,17 +458,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); - + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + list_move_tail(&ci->list, &si->discard_clusters); + ci->flags = 0; schedule_work(&si->discard_work); } -static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); - cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); - cluster_list_add_tail(&si->free_clusters, ci, idx); + if (ci->flags) + list_move_tail(&ci->list, &si->free_clusters); + else + list_add_tail(&ci->list, &si->free_clusters); + ci->flags = CLUSTER_FLAG_FREE; + ci->order = 0; } /* @@ -481,24 +483,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx) */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info, *ci; + struct swap_cluster_info *ci; unsigned int idx; - info = si->cluster_info; - - while (!cluster_list_empty(&si->discard_clusters)) { - idx = cluster_list_del_first(&si->discard_clusters, info); + while (!list_empty(&si->discard_clusters)) { + ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); + list_del(&ci->list); + idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&si->lock); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - __free_cluster(si, idx); + spin_lock(&ci->lock); + __free_cluster(si, ci); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); + spin_unlock(&ci->lock); } } @@ -521,20 +523,15 @@ static void swap_users_ref_free(struct percpu_ref *ref) complete(&si->comp); } -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - struct swap_cluster_info *ci = si->cluster_info; - - VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); - cluster_list_del_first(&si->free_clusters, ci); - cluster_set_count_flag(ci + idx, 0, 0); -} + VM_BUG_ON(ci->count != 0); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); -static void free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - struct swap_cluster_info *ci = si->cluster_info + idx; + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; - VM_BUG_ON(cluster_count(ci) != 0); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed @@ -542,175 +539,371 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx) */ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(si, idx); + swap_cluster_schedule_discard(si, ci); return; } - __free_cluster(si, idx); + __free_cluster(si, ci); } /* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by - * count. + * The cluster corresponding to page_nr will be used. The cluster will not be + * added to free cluster list and its usage counter will be increased by 1. + * Only used for initialization. */ -static void add_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr, - unsigned long count) +static void inc_cluster_info_page(struct swap_info_struct *si, + struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) - alloc_cluster(p, idx); - VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) + count); -} + ci = cluster_info + idx; + ci->count++; -/* - * The cluster corresponding to page_nr will be used. The cluster will be - * removed from free cluster list and its usage counter will be increased by 1. - */ -static void inc_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) -{ - add_cluster_info_page(p, cluster_info, page_nr, 1); + VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); + VM_BUG_ON(ci->flags); } /* - * The cluster corresponding to page_nr decreases one usage. If the usage - * counter becomes 0, which means no page in the cluster is in using, we can - * optionally discard the cluster and add it to free cluster list. + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, + * which means no page in the cluster is in use, we can optionally discard + * the cluster and add it to free cluster list. */ -static void dec_cluster_info_page(struct swap_info_struct *p, - struct swap_cluster_info *cluster_info, unsigned long page_nr) +static void dec_cluster_info_page(struct swap_info_struct *si, + struct swap_cluster_info *ci, int nr_pages) { - unsigned long idx = page_nr / SWAPFILE_CLUSTER; - - if (!cluster_info) + if (!si->cluster_info) return; - VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); - cluster_set_count(&cluster_info[idx], - cluster_count(&cluster_info[idx]) - 1); + VM_BUG_ON(ci->count < nr_pages); + VM_BUG_ON(cluster_is_free(ci)); + lockdep_assert_held(&si->lock); + lockdep_assert_held(&ci->lock); + ci->count -= nr_pages; - if (cluster_count(&cluster_info[idx]) == 0) - free_cluster(p, idx); + if (!ci->count) { + free_cluster(si, ci); + return; + } + + if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { + VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } } -/* - * It's possible scan_swap_map_slots() uses a free cluster in the middle of free - * cluster list. Avoiding such abuse to avoid list corruption. - */ -static bool -scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, - unsigned long offset, int order) +static bool cluster_reclaim_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned long end) { - struct percpu_cluster *percpu_cluster; - bool conflict; + unsigned char *map = si->swap_map; + unsigned long offset; - offset /= SWAPFILE_CLUSTER; - conflict = !cluster_list_empty(&si->free_clusters) && - offset != cluster_list_first(&si->free_clusters) && - cluster_is_free(&si->cluster_info[offset]); + spin_unlock(&ci->lock); + spin_unlock(&si->lock); - if (!conflict) - return false; + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) + continue; + goto out; + default: + goto out; + } + } +out: + spin_lock(&si->lock); + spin_lock(&ci->lock); + + /* + * Recheck the range no matter reclaim succeeded or not, the slot + * could have been be freed while we are not holding the lock. + */ + for (offset = start; offset < end; offset++) + if (READ_ONCE(map[offset])) + return false; - percpu_cluster = this_cpu_ptr(si->percpu_cluster); - percpu_cluster->next[order] = SWAP_NEXT_INVALID; return true; } -static inline bool swap_range_empty(char *swap_map, unsigned int start, - unsigned int nr_pages) +static bool cluster_scan_range(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned long start, unsigned int nr_pages) { - unsigned int i; + unsigned long offset, end = start + nr_pages; + unsigned char *map = si->swap_map; + bool need_reclaim = false; - for (i = 0; i < nr_pages; i++) { - if (swap_map[start + i]) + for (offset = start; offset < end; offset++) { + switch (READ_ONCE(map[offset])) { + case 0: + continue; + case SWAP_HAS_CACHE: + if (!vm_swap_full()) + return false; + need_reclaim = true; + continue; + default: return false; + } } + if (need_reclaim) + return cluster_reclaim_range(si, ci, start, end); + return true; } +static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, + unsigned int start, unsigned char usage, + unsigned int order) +{ + unsigned int nr_pages = 1 << order; + + if (cluster_is_free(ci)) { + if (nr_pages < SWAPFILE_CLUSTER) { + list_move_tail(&ci->list, &si->nonfull_clusters[order]); + ci->flags = CLUSTER_FLAG_NONFULL; + } + ci->order = order; + } + + memset(si->swap_map + start, usage, nr_pages); + swap_range_alloc(si, start, nr_pages); + ci->count += nr_pages; + + if (ci->count == SWAPFILE_CLUSTER) { + VM_BUG_ON(!(ci->flags & + (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); + if (ci->flags & CLUSTER_FLAG_FRAG) + si->frag_cluster_nr[ci->order]--; + list_move_tail(&ci->list, &si->full_clusters); + ci->flags = CLUSTER_FLAG_FULL; + } +} + +static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset, + unsigned int *foundp, unsigned int order, + unsigned char usage) +{ + unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); + unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); + unsigned int nr_pages = 1 << order; + struct swap_cluster_info *ci; + + if (end < nr_pages) + return SWAP_NEXT_INVALID; + end -= nr_pages; + + ci = lock_cluster(si, offset); + if (ci->count + nr_pages > SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + + while (offset <= end) { + if (cluster_scan_range(si, ci, offset, nr_pages)) { + cluster_alloc_range(si, ci, offset, usage, order); + *foundp = offset; + if (ci->count == SWAPFILE_CLUSTER) { + offset = SWAP_NEXT_INVALID; + goto done; + } + offset += nr_pages; + break; + } + offset += nr_pages; + } + if (offset > end) + offset = SWAP_NEXT_INVALID; +done: + unlock_cluster(ci); + return offset; +} + +static void swap_reclaim_full_clusters(struct swap_info_struct *si) +{ + long to_scan = 1; + unsigned long offset, end; + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + int nr_reclaim, total_reclaimed = 0; + + if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; + + while (!list_empty(&si->full_clusters)) { + ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->full_clusters); + offset = cluster_offset(si, ci); + end = min(si->max, offset + SWAPFILE_CLUSTER); + to_scan--; + + while (offset < end) { + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { + spin_unlock(&si->lock); + nr_reclaim = __try_to_reclaim_swap(si, offset, + TTRS_ANYWAY | TTRS_DIRECT); + spin_lock(&si->lock); + if (nr_reclaim > 0) { + offset += nr_reclaim; + total_reclaimed += nr_reclaim; + continue; + } else if (nr_reclaim < 0) { + offset += -nr_reclaim; + continue; + } + } + offset++; + } + if (to_scan <= 0 || total_reclaimed) + break; + } +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ -static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, - unsigned long *offset, unsigned long *scan_base, int order) +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, + unsigned char usage) { - unsigned int nr_pages = 1 << order; struct percpu_cluster *cluster; struct swap_cluster_info *ci; - unsigned int tmp, max; + unsigned int offset, found = 0; new_cluster: + lockdep_assert_held(&si->lock); cluster = this_cpu_ptr(si->percpu_cluster); - tmp = cluster->next[order]; - if (tmp == SWAP_NEXT_INVALID) { - if (!cluster_list_empty(&si->free_clusters)) { - tmp = cluster_next(&si->free_clusters.head) * - SWAPFILE_CLUSTER; - } else if (!cluster_list_empty(&si->discard_clusters)) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them, then - * reread cluster_next_cpu since we dropped si->lock - */ - swap_do_scheduled_discard(si); - *scan_base = this_cpu_read(*si->cluster_next_cpu); - *offset = *scan_base; - goto new_cluster; - } else - return false; + offset = cluster->next[order]; + if (offset) { + offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); + if (found) + goto done; } - /* - * Other CPUs can use our cluster if they can't find a free cluster, - * check if there is still free entry in the cluster, maintaining - * natural alignment. - */ - max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); - if (tmp < max) { - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (swap_range_empty(si->swap_map, tmp, nr_pages)) + if (!list_empty(&si->free_clusters)) { + ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); + VM_BUG_ON(!found); + goto done; + } + + if (order < PMD_ORDER) { + unsigned int frags = 0; + + while (!list_empty(&si->nonfull_clusters[order])) { + ci = list_first_entry(&si->nonfull_clusters[order], + struct swap_cluster_info, list); + list_move_tail(&ci->list, &si->frag_clusters[order]); + ci->flags = CLUSTER_FLAG_FRAG; + si->frag_cluster_nr[order]++; + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) break; - tmp += nr_pages; } - unlock_cluster(ci); + + if (!found) { + /* + * Nonfull clusters are moved to frag tail if we reached + * here, count them too, don't over scan the frag list. + */ + while (frags < si->frag_cluster_nr[order]) { + ci = list_first_entry(&si->frag_clusters[order], + struct swap_cluster_info, list); + /* + * Rotate the frag list to iterate, they were all failing + * high order allocation or moved here due to per-CPU usage, + * this help keeping usable cluster ahead. + */ + list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) + break; + } + } } - if (tmp >= max) { - cluster->next[order] = SWAP_NEXT_INVALID; + + if (found) + goto done; + + if (!list_empty(&si->discard_clusters)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock + */ + swap_do_scheduled_discard(si); goto new_cluster; } - *offset = tmp; - *scan_base = tmp; - tmp += nr_pages; - cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID; - return true; + + if (order) + goto done; + + /* Order 0 stealing from higher order */ + for (int o = 1; o < SWAP_NR_ORDERS; o++) { + /* + * Clusters here have at least one usable slots and can't fail order 0 + * allocation, but reclaim may drop si->lock and race with another user. + */ + while (!list_empty(&si->frag_clusters[o])) { + ci = list_first_entry(&si->frag_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + + while (!list_empty(&si->nonfull_clusters[o])) { + ci = list_first_entry(&si->nonfull_clusters[o], + struct swap_cluster_info, list); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, 0, usage); + if (found) + goto done; + } + } + +done: + /* Try reclaim from full clusters if device is nearfull */ + if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { + swap_reclaim_full_clusters(si); + if (!found && !order && si->pages != si->inuse_pages) + goto new_cluster; + } + + cluster->next[order] = offset; + return found; } -static void __del_from_avail_list(struct swap_info_struct *p) +static void __del_from_avail_list(struct swap_info_struct *si) { int nid; - assert_spin_locked(&p->lock); + assert_spin_locked(&si->lock); for_each_node(nid) - plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); + plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); } -static void del_from_avail_list(struct swap_info_struct *p) +static void del_from_avail_list(struct swap_info_struct *si) { spin_lock(&swap_avail_lock); - __del_from_avail_list(p); + __del_from_avail_list(si); spin_unlock(&swap_avail_lock); } @@ -731,13 +924,13 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, } } -static void add_to_avail_list(struct swap_info_struct *p) +static void add_to_avail_list(struct swap_info_struct *si) { int nid; spin_lock(&swap_avail_lock); for_each_node(nid) - plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); + plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); spin_unlock(&swap_avail_lock); } @@ -747,6 +940,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); + unsigned int i; + + /* + * Use atomic clear_bit operations only on zeromap instead of non-atomic + * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. + */ + for (i = 0; i < nr_entries; i++) + clear_bit(offset + i, si->zeromap); if (offset < si->lowest_bit) si->lowest_bit = offset; @@ -822,11 +1023,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si, return false; } +static int cluster_alloc_swap(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[], int order) +{ + int n_ret = 0; + + VM_BUG_ON(!si->cluster_info); + + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + + if (!offset) + break; + slots[n_ret++] = swp_entry(si->type, offset); + } + + return n_ret; +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -865,26 +1084,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } + if (si->cluster_info) + return cluster_alloc_swap(si, usage, nr, slots, order); + si->flags += SWP_SCANNING; - /* - * Use percpu scan base for SSD to reduce lock contention on - * cluster and swap cache. For HDD, sequential access is more - * important. - */ - if (si->flags & SWP_SOLIDSTATE) - scan_base = this_cpu_read(*si->cluster_next_cpu); - else - scan_base = si->cluster_next; + + /* For HDD, sequential access is more important. */ + scan_base = si->cluster_next; offset = scan_base; - /* SSD algorithm */ - if (si->cluster_info) { - if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } else if (unlikely(!si->cluster_nr--)) { + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -895,8 +1104,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info - * case, just handled by scan_swap_map_try_ssd_cluster() above. */ scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; @@ -924,19 +1131,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } checks: - if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) { - /* take a break if we already got some slots */ - if (n_ret) - goto done; - if (!scan_swap_map_try_ssd_cluster(si, &offset, - &scan_base, order)) { - if (order > 0) - goto no_page; - goto scan; - } - } - } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -944,13 +1138,11 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. */ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; - unlock_cluster(ci); spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&si->lock); /* entry was freed successfully, try to use this again */ if (swap_was_freed > 0) @@ -959,15 +1151,12 @@ checks: } if (si->swap_map[offset]) { - unlock_cluster(ci); if (!n_ret) goto scan; else goto done; } memset(si->swap_map + offset, usage, nr_pages); - add_cluster_info_page(si, si->cluster_info, offset, nr_pages); - unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); slots[n_ret++] = swp_entry(si->type, offset); @@ -988,13 +1177,7 @@ checks: latency_ration = LATENCY_LIMIT; } - /* try to get more slots in cluster */ - if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) - goto checks; - if (order > 0) - goto done; - } else if (si->cluster_nr && !si->swap_map[++offset]) { + if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; @@ -1055,19 +1238,6 @@ no_page: return n_ret; } -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) -{ - unsigned long offset = idx * SWAPFILE_CLUSTER; - struct swap_cluster_info *ci; - - ci = lock_cluster(si, offset); - memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); - cluster_set_count_flag(ci, 0, 0); - free_cluster(si, idx); - unlock_cluster(ci); - swap_range_free(si, offset, SWAPFILE_CLUSTER); -} - int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); @@ -1148,22 +1318,22 @@ noswap: static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { - struct swap_info_struct *p; + struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; - p = swp_swap_info(entry); - if (!p) + si = swp_swap_info(entry); + if (!si) goto bad_nofile; - if (data_race(!(p->flags & SWP_USED))) + if (data_race(!(si->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); - if (offset >= p->max) + if (offset >= si->max) goto bad_offset; - if (data_race(!p->swap_map[swp_offset(entry)])) + if (data_race(!si->swap_map[swp_offset(entry)])) goto bad_free; - return p; + return si; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); @@ -1196,14 +1366,14 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, +static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; - count = p->swap_map[offset]; + count = si->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -1219,7 +1389,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { - if (swap_count_continued(p, offset, count)) + if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; @@ -1229,9 +1399,9 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, usage = count | has_cache; if (usage) - WRITE_ONCE(p->swap_map[offset], usage); + WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE); + WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); return usage; } @@ -1310,66 +1480,121 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, +static unsigned char __swap_entry_free(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, 1); - unlock_cluster_or_swap_info(p, ci); + ci = lock_cluster_or_swap_info(si, offset); + usage = __swap_entry_free_locked(si, offset, 1); + unlock_cluster_or_swap_info(si, ci); if (!usage) free_swap_slot(entry); return usage; } -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +static bool __swap_entries_free(struct swap_info_struct *si, + swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned int type = swp_type(entry); + struct swap_cluster_info *ci; + bool has_cache = false; unsigned char count; + int i; + + if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + goto fallback; + /* cross into another cluster */ + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) + goto fallback; + + ci = lock_cluster_or_swap_info(si, offset); + if (!swap_is_last_map(si, offset, nr, &has_cache)) { + unlock_cluster_or_swap_info(si, ci); + goto fallback; + } + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); + unlock_cluster_or_swap_info(si, ci); + + if (!has_cache) { + for (i = 0; i < nr; i++) + zswap_invalidate(swp_entry(si->type, offset + i)); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, nr); + spin_unlock(&si->lock); + } + return has_cache; + +fallback: + for (i = 0; i < nr; i++) { + if (data_race(si->swap_map[offset + i])) { + count = __swap_entry_free(si, swp_entry(type, offset + i)); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } else { + WARN_ON_ONCE(1); + } + } + return has_cache; +} - ci = lock_cluster(p, offset); - count = p->swap_map[offset]; - VM_BUG_ON(count != SWAP_HAS_CACHE); - p->swap_map[offset] = 0; - dec_cluster_info_page(p, p->cluster_info, offset); +/* + * Drop the last HAS_CACHE flag of swap entries, caller have to + * ensure all entries belong to the same cgroup. + */ +static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, + unsigned int nr_pages) +{ + unsigned long offset = swp_offset(entry); + unsigned char *map = si->swap_map + offset; + unsigned char *map_end = map + nr_pages; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + do { + VM_BUG_ON(*map != SWAP_HAS_CACHE); + *map = 0; + } while (++map < map_end); + dec_cluster_info_page(si, ci, nr_pages); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, 1); - swap_range_free(p, offset, 1); + mem_cgroup_uncharge_swap(entry, nr_pages); + swap_range_free(si, offset, nr_pages); } -static void cluster_swap_free_nr(struct swap_info_struct *sis, - unsigned long offset, int nr_pages) +static void cluster_swap_free_nr(struct swap_info_struct *si, + unsigned long offset, int nr_pages, + unsigned char usage) { struct swap_cluster_info *ci; DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; int i, nr; - ci = lock_cluster_or_swap_info(sis, offset); + ci = lock_cluster_or_swap_info(si, offset); while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(sis, offset + i, 1)) + if (!__swap_entry_free_locked(si, offset + i, usage)) bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(sis, ci); + unlock_cluster_or_swap_info(si, ci); for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(sis->type, offset + i)); + free_swap_slot(swp_entry(si->type, offset + i)); if (nr == nr_pages) return; bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(sis, offset); + ci = lock_cluster_or_swap_info(si, offset); } offset += nr; nr_pages -= nr; } - unlock_cluster_or_swap_info(sis, ci); + unlock_cluster_or_swap_info(si, ci); } /* @@ -1388,7 +1613,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr); + cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } @@ -1400,12 +1625,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); - unsigned long idx = offset / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; struct swap_info_struct *si; - unsigned char *map; - unsigned int i, free_entries = 0; - unsigned char val; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); @@ -1413,24 +1634,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster_or_swap_info(si, offset); - if (size == SWAPFILE_CLUSTER) { - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (free_entries == SWAPFILE_CLUSTER) { - unlock_cluster_or_swap_info(si, ci); - spin_lock(&si->lock); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - return; - } + if (size > 1 && swap_is_has_cache(si, offset, size)) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + swap_entry_range_free(si, entry, size); + spin_unlock(&si->lock); + return; } - for (i = 0; i < size; i++, entry.val++) { + for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { unlock_cluster_or_swap_info(si, ci); free_swap_slot(entry); @@ -1470,7 +1681,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n) for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) - swap_entry_free(p, entries[i]); + swap_entry_range_free(p, entries[i], 1); prev = p; } if (p) @@ -1509,28 +1720,28 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; - struct swap_info_struct *p; + struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = _swap_info_get(entry); - if (!p) + si = _swap_info_get(entry); + if (!si) return 0; offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); + ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(p->swap_map[offset]); + count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - page = vmalloc_to_page(p->swap_map + offset); + page = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -1544,7 +1755,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(p, ci); + unlock_cluster_or_swap_info(si, ci); return count; } @@ -1590,16 +1801,7 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } -/** - * folio_free_swap() - Free the swap space used for this folio. - * @folio: The folio to remove. - * - * If swap is getting full, or if there are no more mappings of this folio, - * then call folio_free_swap to free its swap space. - * - * Return: true if we were able to release the swap space. - */ -bool folio_free_swap(struct folio *folio) +static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -1607,8 +1809,6 @@ bool folio_free_swap(struct folio *folio) return false; if (folio_test_writeback(folio)) return false; - if (folio_swapped(folio)) - return false; /* * Once hibernation has begun to create its image of memory, @@ -1628,6 +1828,25 @@ bool folio_free_swap(struct folio *folio) if (pm_suspended_storage()) return false; + return true; +} + +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. + */ +bool folio_free_swap(struct folio *folio) +{ + if (!folio_swapcache_freeable(folio)) + return false; + if (folio_swapped(folio)) + return false; + delete_from_swap_cache(folio); folio_set_dirty(folio); return true; @@ -1647,11 +1866,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; - unsigned int type = swp_type(entry); struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; - unsigned char count; if (non_swap_entry(entry)) return; @@ -1666,15 +1883,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - for (offset = start_offset; offset < end_offset; offset++) { - if (data_race(si->swap_map[offset])) { - count = __swap_entry_free(si, swp_entry(type, offset)); - if (count == SWAP_HAS_CACHE) - any_only_cache = true; - } else { - WARN_ON_ONCE(1); - } - } + any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1704,7 +1913,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, - TTRS_UNMAPPED | TTRS_FULL); + TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) @@ -1979,7 +2188,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { - struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1987,10 +2195,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, .pmd = pmd, }; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); - if (page) - folio = page_folio(page); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); @@ -2397,52 +2603,54 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static int swap_node(struct swap_info_struct *p) +static int swap_node(struct swap_info_struct *si) { struct block_device *bdev; - if (p->bdev) - bdev = p->bdev; + if (si->bdev) + bdev = si->bdev; else - bdev = p->swap_file->f_inode->i_sb->s_bdev; + bdev = si->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void setup_swap_info(struct swap_info_struct *p, int prio, +static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { int i; if (prio >= 0) - p->prio = prio; + si->prio = prio; else - p->prio = --least_priority; + si->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ - p->list.prio = -p->prio; + si->list.prio = -si->prio; for_each_node(i) { - if (p->prio >= 0) - p->avail_lists[i].prio = -p->prio; + if (si->prio >= 0) + si->avail_lists[i].prio = -si->prio; else { - if (swap_node(p) == i) - p->avail_lists[i].prio = 1; + if (swap_node(si) == i) + si->avail_lists[i].prio = 1; else - p->avail_lists[i].prio = -p->prio; + si->avail_lists[i].prio = -si->prio; } } - p->swap_map = swap_map; - p->cluster_info = cluster_info; + si->swap_map = swap_map; + si->cluster_info = cluster_info; + si->zeromap = zeromap; } -static void _enable_swap_info(struct swap_info_struct *p) +static void _enable_swap_info(struct swap_info_struct *si) { - p->flags |= SWP_WRITEOK; - atomic_long_add(p->pages, &nr_swap_pages); - total_swap_pages += p->pages; + si->flags |= SWP_WRITEOK; + atomic_long_add(si->pages, &nr_swap_pages); + total_swap_pages += si->pages; assert_spin_locked(&swap_lock); /* @@ -2455,40 +2663,41 @@ static void _enable_swap_info(struct swap_info_struct *p) * which allocates swap pages from the highest available priority * swap_info_struct. */ - plist_add(&p->list, &swap_active_head); + plist_add(&si->list, &swap_active_head); /* add to available list iff swap device is not full */ - if (p->highest_bit) - add_to_avail_list(p); + if (si->highest_bit) + add_to_avail_list(si); } -static void enable_swap_info(struct swap_info_struct *p, int prio, +static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info) + struct swap_cluster_info *cluster_info, + unsigned long *zeromap) { spin_lock(&swap_lock); - spin_lock(&p->lock); - setup_swap_info(p, prio, swap_map, cluster_info); - spin_unlock(&p->lock); + spin_lock(&si->lock); + setup_swap_info(si, prio, swap_map, cluster_info, zeromap); + spin_unlock(&si->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ - percpu_ref_resurrect(&p->users); + percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); - spin_lock(&p->lock); - _enable_swap_info(p); - spin_unlock(&p->lock); + spin_lock(&si->lock); + _enable_swap_info(si); + spin_unlock(&si->lock); spin_unlock(&swap_lock); } -static void reinsert_swap_info(struct swap_info_struct *p) +static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); - spin_lock(&p->lock); - setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); - _enable_swap_info(p); - spin_unlock(&p->lock); + spin_lock(&si->lock); + setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); + _enable_swap_info(si); + spin_unlock(&si->lock); spin_unlock(&swap_lock); } @@ -2511,6 +2720,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; @@ -2633,6 +2843,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + zeromap = p->zeromap; + p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); @@ -2645,6 +2857,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) free_percpu(p->cluster_next_cpu); p->cluster_next_cpu = NULL; vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -2874,20 +3087,20 @@ static struct swap_info_struct *alloc_swap_info(void) return p; } -static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { - p->bdev = I_BDEV(inode); + si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (bdev_is_zoned(p->bdev)) + if (bdev_is_zoned(si->bdev)) return -EINVAL; - p->flags |= SWP_BLKDEV; + si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { - p->bdev = inode->i_sb->s_bdev; + si->bdev = inode->i_sb->s_bdev; } return 0; @@ -2922,7 +3135,7 @@ __weak unsigned long arch_max_swapfile_size(void) return generic_max_swapfile_size(); } -static unsigned long read_swap_header(struct swap_info_struct *p, +static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { @@ -2953,9 +3166,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; } - p->lowest_bit = 1; - p->cluster_next = 1; - p->cluster_nr = 0; + si->lowest_bit = 1; + si->cluster_next = 1; + si->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; @@ -2973,7 +3186,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } - p->highest_bit = maxpages - 1; + si->highest_bit = maxpages - 1; if (!maxpages) return 0; @@ -2997,25 +3210,18 @@ static unsigned long read_swap_header(struct swap_info_struct *p, #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) -static int setup_swap_map_and_extents(struct swap_info_struct *p, +static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, - struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { - unsigned int j, k; unsigned int nr_good_pages; + unsigned long i; int nr_extents; - unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; - unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ - cluster_list_init(&p->free_clusters); - cluster_list_init(&p->discard_clusters); - for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) @@ -3023,40 +3229,87 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; - /* - * Haven't marked the cluster free yet, no list - * operation involved - */ - inc_cluster_info_page(p, cluster_info, page_nr); } } - /* Haven't marked the cluster free yet, no list operation involved */ - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(p, cluster_info, i); - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; - /* - * Not mark the cluster free yet, no list - * operation involved - */ - inc_cluster_info_page(p, cluster_info, 0); - p->max = maxpages; - p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, span); + si->max = maxpages; + si->pages = nr_good_pages; + nr_extents = setup_swap_extents(si, span); if (nr_extents < 0) return nr_extents; - nr_good_pages = p->pages; + nr_good_pages = si->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } + return nr_extents; +} + +static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, + union swap_header *swap_header, + unsigned long maxpages) +{ + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + struct swap_cluster_info *cluster_info; + unsigned long i, j, k, idx; + int cpu, err = -ENOMEM; + + cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) - return nr_extents; + goto err; + + for (i = 0; i < nr_clusters; i++) + spin_lock_init(&cluster_info[i].lock); + + si->cluster_next_cpu = alloc_percpu(unsigned int); + if (!si->cluster_next_cpu) + goto err_free; + + /* Random start position to help with wear leveling */ + for_each_possible_cpu(cpu) + per_cpu(*si->cluster_next_cpu, cpu) = + get_random_u32_inclusive(1, si->highest_bit); + + si->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!si->percpu_cluster) + goto err_free; + + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; + + cluster = per_cpu_ptr(si->percpu_cluster, cpu); + for (i = 0; i < SWAP_NR_ORDERS; i++) + cluster->next[i] = SWAP_NEXT_INVALID; + } + + /* + * Mark unusable pages as unavailable. The clusters aren't + * marked free yet, so no list operations are involved yet. + * + * See setup_swap_map_and_extents(): header page, bad pages, + * and the EOF part of the last cluster. + */ + inc_cluster_info_page(si, cluster_info, 0); + for (i = 0; i < swap_header->info.nr_badpages; i++) + inc_cluster_info_page(si, cluster_info, + swap_header->info.badpages[i]); + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(si, cluster_info, i); + + INIT_LIST_HEAD(&si->free_clusters); + INIT_LIST_HEAD(&si->full_clusters); + INIT_LIST_HEAD(&si->discard_clusters); + for (i = 0; i < SWAP_NR_ORDERS; i++) { + INIT_LIST_HEAD(&si->nonfull_clusters[i]); + INIT_LIST_HEAD(&si->frag_clusters[i]); + si->frag_cluster_nr[i] = 0; + } /* * Reduce false cache line sharing between cluster_info and @@ -3065,22 +3318,32 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; + ci = cluster_info + idx; if (idx >= nr_clusters) continue; - if (cluster_count(&cluster_info[idx])) + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); continue; - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, - idx); + } + ci->flags = CLUSTER_FLAG_FREE; + list_add_tail(&ci->list, &si->free_clusters); } } - return nr_extents; + + return cluster_info; + +err_free: + kvfree(cluster_info); +err: + return ERR_PTR(err); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct *p; + struct swap_info_struct *si; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; @@ -3092,8 +3355,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3106,11 +3370,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!swap_avail_heads) return -ENOMEM; - p = alloc_swap_info(); - if (IS_ERR(p)) - return PTR_ERR(p); + si = alloc_swap_info(); + if (IS_ERR(si)) + return PTR_ERR(si); - INIT_WORK(&p->discard_work, swap_discard_work); + INIT_WORK(&si->discard_work, swap_discard_work); name = getname(specialfile); if (IS_ERR(name)) { @@ -3125,12 +3389,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - p->swap_file = swap_file; + si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; - error = claim_swapfile(p, inode); + error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; @@ -3151,14 +3415,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; goto bad_swap_unlock_inode; } - page = read_mapping_page(mapping, 0, swap_file); - if (IS_ERR(page)) { - error = PTR_ERR(page); + folio = read_mapping_folio(mapping, 0, swap_file); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); goto bad_swap_unlock_inode; } - swap_header = kmap(page); + swap_header = kmap_local_folio(folio, 0); - maxpages = read_swap_header(p, swap_header, inode); + maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; @@ -3171,79 +3435,57 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - if (p->bdev && bdev_stable_writes(p->bdev)) - p->flags |= SWP_STABLE_WRITES; + error = swap_cgroup_swapon(si->type, maxpages); + if (error) + goto bad_swap_unlock_inode; - if (p->bdev && bdev_synchronous(p->bdev)) - p->flags |= SWP_SYNCHRONOUS_IO; + nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, + maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap_unlock_inode; + } - if (p->bdev && bdev_nonrot(p->bdev)) { - int cpu, i; - unsigned long ci, nr_cluster; + /* + * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might + * be above MAX_PAGE_ORDER incase of a large swap file. + */ + zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), + GFP_KERNEL | __GFP_ZERO); + if (!zeromap) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } - p->flags |= SWP_SOLIDSTATE; - p->cluster_next_cpu = alloc_percpu(unsigned int); - if (!p->cluster_next_cpu) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } - /* - * select a random position to start with to help wear leveling - * SSD - */ - for_each_possible_cpu(cpu) { - per_cpu(*p->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, p->highest_bit); - } - nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + if (si->bdev && bdev_stable_writes(si->bdev)) + si->flags |= SWP_STABLE_WRITES; - cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), - GFP_KERNEL); - if (!cluster_info) { - error = -ENOMEM; - goto bad_swap_unlock_inode; - } + if (si->bdev && bdev_synchronous(si->bdev)) + si->flags |= SWP_SYNCHRONOUS_IO; - for (ci = 0; ci < nr_cluster; ci++) - spin_lock_init(&((cluster_info + ci)->lock)); + if (si->bdev && bdev_nonrot(si->bdev)) { + si->flags |= SWP_SOLIDSTATE; - p->percpu_cluster = alloc_percpu(struct percpu_cluster); - if (!p->percpu_cluster) { - error = -ENOMEM; + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; goto bad_swap_unlock_inode; } - for_each_possible_cpu(cpu) { - struct percpu_cluster *cluster; - - cluster = per_cpu_ptr(p->percpu_cluster, cpu); - for (i = 0; i < SWAP_NR_ORDERS; i++) - cluster->next[i] = SWAP_NEXT_INVALID; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } - error = swap_cgroup_swapon(p->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - - nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - cluster_info, maxpages, &span); - if (unlikely(nr_extents < 0)) { - error = nr_extents; - goto bad_swap_unlock_inode; - } - if ((swap_flags & SWAP_FLAG_DISCARD) && - p->bdev && bdev_max_discard_sectors(p->bdev)) { + si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ - p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* @@ -3253,24 +3495,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) - p->flags &= ~SWP_PAGE_DISCARD; + si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) - p->flags &= ~SWP_AREA_DISCARD; + si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ - if (p->flags & SWP_AREA_DISCARD) { - int err = discard_swap(p); + if (si->flags & SWP_AREA_DISCARD) { + int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", - p, err); + si, err); } } - error = init_swap_address_space(p->type, maxpages); + error = init_swap_address_space(si->type, maxpages); if (error) goto bad_swap_unlock_inode; - error = zswap_swapon(p->type, maxpages); + error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; @@ -3290,15 +3532,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, cluster_info); + enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - K(p->pages), name->name, p->prio, nr_extents, + K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), - (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : "", - (p->flags & SWP_AREA_DISCARD) ? "s" : "", - (p->flags & SWP_PAGE_DISCARD) ? "c" : ""); + (si->flags & SWP_SOLIDSTATE) ? "SS" : "", + (si->flags & SWP_DISCARDABLE) ? "D" : "", + (si->flags & SWP_AREA_DISCARD) ? "s" : "", + (si->flags & SWP_PAGE_DISCARD) ? "c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -3307,34 +3549,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; free_swap_zswap: - zswap_swapoff(p->type); + zswap_swapoff(si->type); free_swap_address_space: - exit_swap_address_space(p->type); + exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: - free_percpu(p->percpu_cluster); - p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; + free_percpu(si->percpu_cluster); + si->percpu_cluster = NULL; + free_percpu(si->cluster_next_cpu); + si->cluster_next_cpu = NULL; inode = NULL; - destroy_swap_extents(p); - swap_cgroup_swapoff(p->type); + destroy_swap_extents(si); + swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); - p->swap_file = NULL; - p->flags = 0; + si->swap_file = NULL; + si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: - if (page && !IS_ERR(page)) { - kunmap(page); - put_page(page); - } + if (!IS_ERR_OR_NULL(folio)) + folio_release_kmap(folio, swap_header); if (name) putname(name); if (inode) @@ -3362,7 +3603,7 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that a swap entry is valid and increment its swap map count. + * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 @@ -3372,63 +3613,76 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, unsigned char usage) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { - struct swap_info_struct *p; + struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; - int err; + int err, i; - p = swp_swap_info(entry); + si = swp_swap_info(entry); offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - - count = p->swap_map[offset]; - - /* - * swapin_readahead() doesn't check if a swap entry is valid, so the - * swap entry could be SWAP_MAP_BAD. Check here with lock held. - */ - if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { - err = -ENOENT; - goto unlock_out; - } + VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); + VM_WARN_ON(usage == 1 && nr > 1); + ci = lock_cluster_or_swap_info(si, offset); - has_cache = count & SWAP_HAS_CACHE; - count &= ~SWAP_HAS_CACHE; err = 0; + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; - if (usage == SWAP_HAS_CACHE) { + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } - /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) - has_cache = SWAP_HAS_CACHE; - else if (has_cache) /* someone else added cache */ - err = -EEXIST; - else /* no users remaining */ + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + + if (!count && !has_cache) { err = -ENOENT; + } else if (usage == SWAP_HAS_CACHE) { + if (has_cache) + err = -EEXIST; + } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { + err = -EINVAL; + } + + if (err) + goto unlock_out; + } - } else if (count || has_cache) { + for (i = 0; i < nr; i++) { + count = si->swap_map[offset + i]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + if (usage == SWAP_HAS_CACHE) + has_cache = SWAP_HAS_CACHE; + else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; - else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) - err = -EINVAL; - else if (swap_count_continued(p, offset, count)) + else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; - else + else { + /* + * Don't need to rollback changes, because if + * usage == 1, there must be nr == 1. + */ err = -ENOMEM; - } else - err = -ENOENT; /* unused swap entry */ + goto unlock_out; + } - if (!err) - WRITE_ONCE(p->swap_map[offset], count | has_cache); + WRITE_ONCE(si->swap_map[offset + i], count | has_cache); + } unlock_out: - unlock_cluster_or_swap_info(p, ci); + unlock_cluster_or_swap_info(si, ci); return err; } @@ -3436,9 +3690,9 @@ unlock_out: * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). */ -void swap_shmem_alloc(swp_entry_t entry) +void swap_shmem_alloc(swp_entry_t entry, int nr) { - __swap_duplicate(entry, SWAP_MAP_SHMEM); + __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* @@ -3452,35 +3706,29 @@ int swap_duplicate(swp_entry_t entry) { int err = 0; - while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* - * @entry: swap entry for which we allocate swap cache. + * @entry: first swap entry from which we allocate nr swap cache. * - * Called when allocating swap cache for existing swap entry, + * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ -int swapcache_prepare(swp_entry_t entry) +int swapcache_prepare(swp_entry_t entry, int nr) { - return __swap_duplicate(entry, SWAP_HAS_CACHE); + return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry) +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); - unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); - usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); - if (!usage) - free_swap_slot(entry); + cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index acc56c75ba99..ce13c4062647 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct page *page; int ret; - ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; @@ -1763,3 +1763,171 @@ out: VM_WARN_ON(!moved && !err); return moved ? moved : err; } + +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + + vm_flags_reset(vma, flags); + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + +static void userfaultfd_set_ctx(struct vm_area_struct *vma, + struct userfaultfd_ctx *ctx, + unsigned long flags) +{ + vma_start_write(vma); + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; + userfaultfd_set_vm_flags(vma, + (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); +} + +void userfaultfd_reset_ctx(struct vm_area_struct *vma) +{ + userfaultfd_set_ctx(vma, NULL, 0); +} + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct vm_area_struct *ret; + + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, start, end - start, false); + + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, + vma->vm_flags & ~__VM_UFFD_FLAGS, + NULL_VM_UFFD_CTX); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + if (!IS_ERR(ret)) + userfaultfd_reset_ctx(ret); + + return ret; +} + +/* Assumes mmap write lock taken, and mm_struct pinned. */ +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async) +{ + VMA_ITERATOR(vmi, ctx->mm, start); + struct vm_area_struct *prev = vma_prev(&vmi); + unsigned long vma_end; + unsigned long new_flags; + + if (vma->vm_start < start) + prev = vma; + + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + userfaultfd_set_ctx(vma, ctx, vm_flags); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + +skip: + prev = vma; + start = vma->vm_end; + } + + return 0; +} + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + /* the various vma->vm_userfaultfd_ctx still points to it */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (vma->vm_userfaultfd_ctx.ctx == ctx) + userfaultfd_reset_ctx(vma); + } + mmap_write_unlock(mm); +} + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx) +{ + struct vm_area_struct *vma, *prev; + VMA_ITERATOR(vmi, mm, 0); + + if (!mmget_not_zero(mm)) + return; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_lock. So + * it's critical that released is set to true (above), before + * taking the mmap_lock for writing. + */ + mmap_write_lock(mm); + prev = NULL; + for_each_vma(vmi, vma) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end); + prev = vma; + } + mmap_write_unlock(mm); + mmput(mm); +} diff --git a/mm/util.c b/mm/util.c index bd283e2132e0..4f1275023eb7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -463,7 +463,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) if (gap + pad > gap) gap += pad; - if (gap < MIN_GAP) + if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; @@ -608,6 +608,28 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + /** * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. @@ -627,32 +649,15 @@ EXPORT_SYMBOL(vm_mmap); */ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - gfp_t kmalloc_flags = flags; void *ret; /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - kmalloc_flags |= __GFP_NOWARN; - - if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) - kmalloc_flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - kmalloc_flags &= ~__GFP_NOFAIL; - } - - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node); - - /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), + kmalloc_gfp_adjust(flags, size), + node); if (ret || size <= PAGE_SIZE) return ret; @@ -715,18 +720,53 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) { - void *newp; + void *n; - if (oldsize >= newsize) - return (void *)p; - newp = kvmalloc_noprof(newsize, flags); - if (!newp) - return NULL; - memcpy(newp, p, oldsize); - kvfree(p); - return newp; + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; } EXPORT_SYMBOL(kvrealloc_noprof); diff --git a/mm/vma.c b/mm/vma.c new file mode 100644 index 000000000000..4737afcb064c --- /dev/null +++ b/mm/vma.c @@ -0,0 +1,2068 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * VMA-specific functions. + */ + +#include "vma_internal.h" +#include "vma.h" + +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) +{ + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; + + if (!mpol_equal(vmg->policy, vma_policy(vma))) + return false; + /* + * VM_SOFTDIRTY should not prevent from VMA merging, if we + * match the flags but dirty bit -- the caller should mark + * merged VMA as dirty. If dirty bit won't be excluded from + * comparison, we increase pressure on the memory system forcing + * the kernel to generate new VMAs when old one could be + * extended instead. + */ + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) + return false; + if (vma->vm_file != vmg->file) + return false; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) + return false; + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) + return false; + return true; +} + +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2, struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + if ((!anon_vma1 || !anon_vma2) && (!vma || + list_is_singular(&vma->anon_vma_chain))) + return true; + return anon_vma1 == anon_vma2; +} + +/* Are the anon_vma's belonging to each VMA compatible with one another? */ +static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, + struct vm_area_struct *vma2) +{ + return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); +} + +/* + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed + */ +static void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, + struct vm_area_struct *next, + struct vm_area_struct *remove, + struct vm_area_struct *remove2) +{ + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. + * + * We assume the vma may be removed as part of the merge. + */ +static bool can_vma_merge_before(struct vma_merge_struct *vmg) +{ + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + if (is_mergeable_vma(vmg, /* merge_next = */ true) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { + if (vmg->next->vm_pgoff == vmg->pgoff + pglen) + return true; + } + + return false; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We assume that vma is not removed as part of the merge. + */ +static bool can_vma_merge_after(struct vma_merge_struct *vmg) +{ + if (is_mergeable_vma(vmg, /* merge_next = */ false) && + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) + return true; + } + return false; +} + +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_allow_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Requires inode->i_mapping->i_mmap_rwsem + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma_is_shared_maywrite(vma)) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct + */ +static void vma_prepare(struct vma_prepare *vp) +{ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + +} + +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, + struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } + + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } + + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); + } + + if (vp->remove) { +again: + vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we are removing both mid and next vmas + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + +/* + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + */ +static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) +{ + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); +} + +/* + * Can the proposed VMA be merged with the left (previous) VMA taking into + * account the start position of the proposed range. + */ +static bool can_vma_merge_left(struct vma_merge_struct *vmg) + +{ + return vmg->prev && vmg->prev->vm_end == vmg->start && + can_vma_merge_after(vmg); +} + +/* + * Can the proposed VMA be merged with the right (next) VMA taking into + * account the end position of the proposed range. + * + * In addition, if we can merge with the left VMA, ensure that left and right + * anon_vma's are also compatible. + */ +static bool can_vma_merge_right(struct vma_merge_struct *vmg, + bool can_merge_left) +{ + if (!vmg->next || vmg->end != vmg->next->vm_start || + !can_vma_merge_before(vmg)) + return false; + + if (!can_merge_left) + return true; + + /* + * If we can merge with prev (left) and next (right), indicating that + * each VMA's anon_vma is compatible with the proposed anon_vma, this + * does not mean prev and next are compatible with EACH OTHER. + * + * We therefore check this in addition to mergeability to either side. + */ + return are_anon_vmas_compatible(vmg->prev, vmg->next); +} + +/* + * Close a vm structure and free it. + */ +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed) +{ + might_sleep(); + if (!closed && vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); + if (unreachable) + __vm_area_free(vma); + else + vm_area_free(vma); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the mm semaphore held. + */ +void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, + /* mm_wr_locked = */ true); + mas_set(mas, vma->vm_end); + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + next ? next->vm_start : USER_PGTABLES_CEILING, + /* mm_wr_locked = */ true); + tlb_finish_mmu(&tlb); +} + +/* + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the original VMA. + */ +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + struct vma_prepare vp; + struct vm_area_struct *new; + int err; + + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); + + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); + if (err) + return err; + } + + new = vm_area_dup(vma); + if (!new) + return -ENOMEM; + + if (new_below) { + new->vm_end = addr; + } else { + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + err = -ENOMEM; + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, new)) + goto out_free_vma; + + err = vma_dup_policy(vma, new); + if (err) + goto out_free_vmi; + + err = anon_vma_clone(new, vma); + if (err) + goto out_free_mpol; + + if (new->vm_file) + get_file(new->vm_file); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + vma_start_write(vma); + vma_start_write(new); + + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; + } + + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + + /* Success. */ + if (new_below) + vma_next(vmi); + else + vma_prev(vmi); + + return 0; + +out_free_mpol: + mpol_put(vma_policy(new)); +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: + vm_area_free(new); + return err; +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + if (vma->vm_mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + return __split_vma(vmi, vma, addr, new_below); +} + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. + */ +void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * @dup: Pointer to the destination VMA when successful. + * + * Returns: 0 on success. + */ +static int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src, struct vm_area_struct **dup) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + int ret; + + vma_assert_write_locked(dst); + dst->anon_vma = src->anon_vma; + ret = anon_vma_clone(dst, src); + if (ret) + return ret; + + *dup = dst; + } + + return 0; +} + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE +void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mt_validate(&mm->mm_mt); + for_each_vma(vmi, vma) { +#ifdef CONFIG_DEBUG_VM_RB + struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; +#endif + unsigned long vmi_start, vmi_end; + bool warn = 0; + + vmi_start = vma_iter_addr(&vmi); + vmi_end = vma_iter_end(&vmi); + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) + warn = 1; + + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) + warn = 1; + + if (warn) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma); + pr_emerg("tree range: %px start %lx end %lx\n", vma, + vmi_start, vmi_end - 1); + vma_iter_dump_tree(&vmi); + } + +#ifdef CONFIG_DEBUG_VM_RB + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } +#endif + i++; + } + if (i != mm->map_count) { + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); + bug = 1; + } + VM_BUG_ON_MM(bug, mm); +} +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ + +/* Actually perform the VMA merge operation. */ +static int commit_merge(struct vma_merge_struct *vmg, + struct vm_area_struct *adjust, + struct vm_area_struct *remove, + struct vm_area_struct *remove2, + long adj_start, + bool expanded) +{ + struct vma_prepare vp; + + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); + + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + if (expanded) { + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); + } else { + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } + + if (vma_iter_prealloc(vmg->vmi, vmg->vma)) + return -ENOMEM; + + vma_prepare(&vp); + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); + + if (expanded) + vma_iter_store(vmg->vmi, vmg->vma); + + if (adj_start) { + adjust->vm_start += adj_start; + adjust->vm_pgoff += PHYS_PFN(adj_start); + if (adj_start < 0) { + WARN_ON(expanded); + vma_iter_store(vmg->vmi, adjust); + } + } + + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); + + return 0; +} + +/* We can only remove VMAs when merging if they do not have a close hook. */ +static bool can_merge_remove_vma(struct vm_area_struct *vma) +{ + return !vma->vm_ops || !vma->vm_ops->close; +} + +/* + * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its + * attributes modified. + * + * @vmg: Describes the modifications being made to a VMA and associated + * metadata. + * + * When the attributes of a range within a VMA change, then it might be possible + * for immediately adjacent VMAs to be merged into that VMA due to having + * identical properties. + * + * This function checks for the existence of any such mergeable VMAs and updates + * the maple tree describing the @vmg->vma->vm_mm address space to account for + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. + * + * As part of this operation, if a merge occurs, the @vmg object will have its + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent + * calls to this function should reset these fields. + * + * Returns: The merged VMA if merge succeeds, or NULL otherwise. + * + * ASSUMPTIONS: + * - The caller must assign the VMA to be modifed to @vmg->vma. + * - The caller must have set @vmg->prev to the previous VMA, if there is one. + * - The caller must not set @vmg->next, as we determine this. + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). + */ +static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next, *res; + struct vm_area_struct *anon_dup = NULL; + struct vm_area_struct *adjust = NULL; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + bool left_side = vma && start == vma->vm_start; + bool right_side = vma && end == vma->vm_end; + int err = 0; + long adj_start = 0; + bool merge_will_delete_vma, merge_will_delete_next; + bool merge_left, merge_right, merge_both; + bool expanded; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */ + VM_WARN_ON(vmg->next); /* We set this. */ + VM_WARN_ON(prev && start <= prev->vm_start); + VM_WARN_ON(start >= end); + /* + * If vma == prev, then we are offset into a VMA. Otherwise, if we are + * not, we must span a portion of the VMA. + */ + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) || + vmg->end > vma->vm_end)); + /* The vmi must be positioned within vmg->vma. */ + VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && + vma_iter_addr(vmg->vmi) < vma->vm_end)); + + vmg->state = VMA_MERGE_NOMERGE; + + /* + * If a special mapping or if the range being modified is neither at the + * furthermost left or right side of the VMA, then we have no chance of + * merging and should abort. + */ + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + return NULL; + + if (left_side) + merge_left = can_vma_merge_left(vmg); + else + merge_left = false; + + if (right_side) { + next = vmg->next = vma_iter_next_range(vmg->vmi); + vma_iter_prev_range(vmg->vmi); + + merge_right = can_vma_merge_right(vmg, merge_left); + } else { + merge_right = false; + next = NULL; + } + + if (merge_left) /* If merging prev, position iterator there. */ + vma_prev(vmg->vmi); + else if (!merge_right) /* If we have nothing to merge, abort. */ + return NULL; + + merge_both = merge_left && merge_right; + /* If we span the entire VMA, a merge implies it will be deleted. */ + merge_will_delete_vma = left_side && right_side; + + /* + * If we need to remove vma in its entirety but are unable to do so, + * we have no sensible recourse but to abort the merge. + */ + if (merge_will_delete_vma && !can_merge_remove_vma(vma)) + return NULL; + + /* + * If we merge both VMAs, then next is also deleted. This implies + * merge_will_delete_vma also. + */ + merge_will_delete_next = merge_both; + + /* + * If we cannot delete next, then we can reduce the operation to merging + * prev and vma (thereby deleting vma). + */ + if (merge_will_delete_next && !can_merge_remove_vma(next)) { + merge_will_delete_next = false; + merge_right = false; + merge_both = false; + } + + /* No matter what happens, we will be adjusting vma. */ + vma_start_write(vma); + + if (merge_left) + vma_start_write(prev); + + if (merge_right) + vma_start_write(next); + + if (merge_both) { + /* + * |<----->| + * |-------*********-------| + * prev vma next + * extend delete delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->end = next->vm_end; + vmg->pgoff = prev->vm_pgoff; + + /* + * We already ensured anon_vma compatibility above, so now it's + * simply a case of, if prev has no anon_vma object, which of + * next or vma contains the anon_vma we must duplicate. + */ + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); + } else if (merge_left) { + /* + * |<----->| OR + * |<--------->| + * |-------************* + * prev vma + * extend shrink/delete + */ + + vmg->vma = prev; + vmg->start = prev->vm_start; + vmg->pgoff = prev->vm_pgoff; + + if (!merge_will_delete_vma) { + adjust = vma; + adj_start = vmg->end - vma->vm_start; + } + + err = dup_anon_vma(prev, vma, &anon_dup); + } else { /* merge_right */ + /* + * |<----->| OR + * |<--------->| + * *************-------| + * vma next + * shrink/delete extend + */ + + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); + + VM_WARN_ON(!merge_right); + /* If we are offset into a VMA, then prev must be vma. */ + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev); + + if (merge_will_delete_vma) { + vmg->vma = next; + vmg->end = next->vm_end; + vmg->pgoff = next->vm_pgoff - pglen; + } else { + /* + * We shrink vma and expand next. + * + * IMPORTANT: This is the ONLY case where the final + * merged VMA is NOT vmg->vma, but rather vmg->next. + */ + + vmg->start = vma->vm_start; + vmg->end = start; + vmg->pgoff = vma->vm_pgoff; + + adjust = next; + adj_start = -(vma->vm_end - start); + } + + err = dup_anon_vma(next, vma, &anon_dup); + } + + if (err) + goto abort; + + /* + * In nearly all cases, we expand vmg->vma. There is one exception - + * merge_right where we partially span the VMA. In this case we shrink + * the end of vmg->vma and adjust the start of vmg->next accordingly. + */ + expanded = !merge_right || merge_will_delete_vma; + + if (commit_merge(vmg, adjust, + merge_will_delete_vma ? vma : NULL, + merge_will_delete_next ? next : NULL, + adj_start, expanded)) { + if (anon_dup) + unlink_anon_vmas(anon_dup); + + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; + } + + res = merge_left ? prev : next; + khugepaged_enter_vma(res, vmg->flags); + + vmg->state = VMA_MERGE_SUCCESS; + return res; + +abort: + vma_iter_set(vmg->vmi, start); + vma_iter_load(vmg->vmi); + vmg->state = VMA_MERGE_ERROR_NOMEM; + return NULL; +} + +/* + * vma_merge_new_range - Attempt to merge a new VMA into address space + * + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end + * (exclusive), which we try to merge with any adjacent VMAs if possible. + * + * We are about to add a VMA to the address space starting at @vmg->start and + * ending at @vmg->end. There are three different possible scenarios: + * + * 1. There is a VMA with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) either before or after it - + * EXPAND that VMA: + * + * Proposed: |-----| or |-----| + * Existing: |----| |----| + * + * 2. There are VMAs with identical properties immediately adjacent to the + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - + * EXPAND the former and REMOVE the latter: + * + * Proposed: |-----| + * Existing: |----| |----| + * + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those + * VMAs do not have identical attributes - NO MERGE POSSIBLE. + * + * In instances where we can merge, this function returns the expanded VMA which + * will have its range adjusted accordingly and the underlying maple tree also + * adjusted. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. + * + * This function adjusts @vmg to provide @vmg->next if not already specified, + * and adjusts [@vmg->start, @vmg->end) to span the expanded range. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. + * - The caller must have determined that [@vmg->start, @vmg->end) is empty, + other than VMAs that will be unmapped should the operation succeed. + * - The caller must have specified the previous vma in @vmg->prev. + * - The caller must have specified the next vma in @vmg->next. + * - The caller must have positioned the vmi at or before the gap. + */ +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *prev = vmg->prev; + struct vm_area_struct *next = vmg->next; + unsigned long start = vmg->start; + unsigned long end = vmg->end; + pgoff_t pgoff = vmg->pgoff; + pgoff_t pglen = PHYS_PFN(end - start); + bool can_merge_left, can_merge_right; + + mmap_assert_write_locked(vmg->mm); + VM_WARN_ON(vmg->vma); + /* vmi must point at or before the gap. */ + VM_WARN_ON(vma_iter_addr(vmg->vmi) > end); + + vmg->state = VMA_MERGE_NOMERGE; + + /* Special VMAs are unmergeable, also if no prev/next. */ + if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + return NULL; + + can_merge_left = can_vma_merge_left(vmg); + can_merge_right = can_vma_merge_right(vmg, can_merge_left); + + /* If we can merge with the next VMA, adjust vmg accordingly. */ + if (can_merge_right) { + vmg->end = next->vm_end; + vmg->vma = next; + vmg->pgoff = next->vm_pgoff - pglen; + } + + /* If we can merge with the previous VMA, adjust vmg accordingly. */ + if (can_merge_left) { + vmg->start = prev->vm_start; + vmg->vma = prev; + vmg->pgoff = prev->vm_pgoff; + + /* + * If this merge would result in removal of the next VMA but we + * are not permitted to do so, reduce the operation to merging + * prev and vma. + */ + if (can_merge_right && !can_merge_remove_vma(next)) + vmg->end = end; + + vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + } + + /* + * Now try to expand adjacent VMA(s). This takes care of removing the + * following VMA if we have VMAs on both sides. + */ + if (vmg->vma && !vma_expand(vmg)) { + khugepaged_enter_vma(vmg->vma, vmg->flags); + vmg->state = VMA_MERGE_SUCCESS; + return vmg->vma; + } + + /* If expansion failed, reset state. Allows us to retry merge later. */ + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + + return NULL; +} + +/* + * vma_expand - Expand an existing VMA + * + * @vmg: Describes a VMA expansion operation. + * + * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. + * Will expand over vmg->next if it's different from vmg->vma and vmg->end == + * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with + * vmg->next needs to be handled by the caller. + * + * Returns: 0 on success. + * + * ASSUMPTIONS: + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. + * - The caller must have set @vmg->vma and @vmg->next. + */ +int vma_expand(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *anon_dup = NULL; + bool remove_next = false; + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *next = vmg->next; + + mmap_assert_write_locked(vmg->mm); + + vma_start_write(vma); + if (next && (vma != next) && (vmg->end == next->vm_end)) { + int ret; + + remove_next = true; + /* This should already have been checked by this point. */ + VM_WARN_ON(!can_merge_remove_vma(next)); + vma_start_write(next); + ret = dup_anon_vma(vma, next, &anon_dup); + if (ret) + return ret; + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_WARN_ON(next && !remove_next && + next != vma && vmg->end > next->vm_start); + /* Only handles expanding */ + VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end); + + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) + goto nomem; + + return 0; + +nomem: + vmg->state = VMA_MERGE_ERROR_NOMEM; + if (anon_dup) + unlink_anon_vmas(anon_dup); + return -ENOMEM; +} + +/* + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise + */ +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) +{ + struct vma_prepare vp; + + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); + + if (vma->vm_start < start) + vma_iter_config(vmi, vma->vm_start, start); + else + vma_iter_config(vmi, end, vma->vm_end); + + if (vma_iter_prealloc(vmi, NULL)) + return -ENOMEM; + + vma_start_write(vma); + + init_vma_prep(&vp, vma); + vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); + + vma_iter_clear(vmi); + vma_set_range(vma, start, end, pgoff); + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); + return 0; +} + +static inline void vms_clear_ptes(struct vma_munmap_struct *vms, + struct ma_state *mas_detach, bool mm_wr_locked) +{ + struct mmu_gather tlb; + + if (!vms->clear_ptes) /* Nothing to do */ + return; + + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + mas_set(mas_detach, 1); + lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, + vms->vma_count, mm_wr_locked); + + mas_set(mas_detach, 1); + /* start and end may be different if there is no prev or next vma. */ + free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, + vms->unmap_end, mm_wr_locked); + tlb_finish_mmu(&tlb); + vms->clear_ptes = false; +} + +void vms_clean_up_area(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + if (!vms->nr_pages) + return; + + vms_clear_ptes(vms, mas_detach, true); + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + vms->closed_vm_ops = true; +} + +/* + * vms_complete_munmap_vmas() - Finish the munmap() operation + * @vms: The vma munmap struct + * @mas_detach: The maple state of the detached vmas + * + * This updates the mm_struct, unmaps the region, frees the resources + * used for the munmap() and may downgrade the lock - if requested. Everything + * needed to be done once the vma maple tree is updated. + */ +void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + mm = current->mm; + mm->map_count -= vms->vma_count; + mm->locked_vm -= vms->locked_vm; + if (vms->unlock) + mmap_write_downgrade(mm); + + if (!vms->nr_pages) + return; + + vms_clear_ptes(vms, mas_detach, !vms->unlock); + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); + /* Stat accounting */ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); + /* Paranoid bookkeeping */ + VM_WARN_ON(vms->exec_vm > mm->exec_vm); + VM_WARN_ON(vms->stack_vm > mm->stack_vm); + VM_WARN_ON(vms->data_vm > mm->data_vm); + mm->exec_vm -= vms->exec_vm; + mm->stack_vm -= vms->stack_vm; + mm->data_vm -= vms->data_vm; + + /* Remove and clean up vmas */ + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + remove_vma(vma, /* = */ false, vms->closed_vm_ops); + + vm_unacct_memory(vms->nr_accounted); + validate_mm(mm); + if (vms->unlock) + mmap_read_unlock(mm); + + __mt_destroy(mas_detach->tree); +} + +/* + * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree + * for removal at a later date. Handles splitting first and last if necessary + * and marking the vmas as isolated. + * + * @vms: The vma munmap struct + * @mas_detach: The maple state tracking the detached tree + * + * Return: 0 on success, error otherwise + */ +int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct vm_area_struct *next = NULL; + int error; + + /* + * If we need to split any vma, do it now to save pain later. + * Does it split the first one? + */ + if (vms->start > vms->vma->vm_start) { + + /* + * Make sure that map_count on return from munmap() will + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. + */ + if (vms->end < vms->vma->vm_end && + vms->vma->vm_mm->map_count >= sysctl_max_map_count) { + error = -ENOMEM; + goto map_count_exceeded; + } + + /* Don't bother splitting the VMA if we can't unmap it anyway */ + if (!can_modify_vma(vms->vma)) { + error = -EPERM; + goto start_split_failed; + } + + error = __split_vma(vms->vmi, vms->vma, vms->start, 1); + if (error) + goto start_split_failed; + } + vms->prev = vma_prev(vms->vmi); + if (vms->prev) + vms->unmap_start = vms->prev->vm_end; + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + for_each_vma_range(*(vms->vmi), next, vms->end) { + long nrpages; + + if (!can_modify_vma(next)) { + error = -EPERM; + goto modify_vma_failed; + } + /* Does it split the end? */ + if (next->vm_end > vms->end) { + error = __split_vma(vms->vmi, next, vms->end, 0); + if (error) + goto end_split_failed; + } + vma_start_write(next); + mas_set(mas_detach, vms->vma_count++); + error = mas_store_gfp(mas_detach, next, GFP_KERNEL); + if (error) + goto munmap_gather_failed; + + vma_mark_detached(next, true); + nrpages = vma_pages(next); + + vms->nr_pages += nrpages; + if (next->vm_flags & VM_LOCKED) + vms->locked_vm += nrpages; + + if (next->vm_flags & VM_ACCOUNT) + vms->nr_accounted += nrpages; + + if (is_exec_mapping(next->vm_flags)) + vms->exec_vm += nrpages; + else if (is_stack_mapping(next->vm_flags)) + vms->stack_vm += nrpages; + else if (is_data_mapping(next->vm_flags)) + vms->data_vm += nrpages; + + if (unlikely(vms->uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + error = userfaultfd_unmap_prep(next, vms->start, + vms->end, vms->uf); + if (error) + goto userfaultfd_error; + } +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < vms->start); + BUG_ON(next->vm_start > vms->end); +#endif + } + + vms->next = vma_next(vms->vmi); + if (vms->next) + vms->unmap_end = vms->next->vm_start; + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, mas_detach->tree, 0, 0); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + vma_iter_set(vms->vmi, vms->start); + rcu_read_lock(); + vma_test = mas_find(&test, vms->vma_count - 1); + for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, vms->vma_count - 1); + } + rcu_read_unlock(); + BUG_ON(vms->vma_count != test_count); + } +#endif + + while (vma_iter_addr(vms->vmi) > vms->start) + vma_iter_prev_range(vms->vmi); + + vms->clear_ptes = true; + return 0; + +userfaultfd_error: +munmap_gather_failed: +end_split_failed: +modify_vma_failed: + reattach_vmas(mas_detach); +start_split_failed: +map_count_exceeded: + return error; +} + +/* + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on + * success. + * + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. + */ +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, unsigned long end, + struct list_head *uf, bool unlock) +{ + struct maple_tree mt_detach; + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + struct vma_munmap_struct vms; + int error; + + init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) + goto gather_failed; + + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); + if (error) + goto clear_tree_failed; + + /* Point of no return */ + vms_complete_munmap_vmas(&vms, &mas_detach); + return 0; + +clear_tree_failed: + reattach_vmas(&mas_detach); +gather_failed: + validate_mm(mm); + return error; +} + +/* + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @unlock: set to true if the user wants to drop the mmap_lock on success + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned. + * + * Return: 0 on success and drops the lock if so directed, error and leaves the + * lock held otherwise. + */ +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* Find the first overlapping VMA */ + vma = vma_find(vmi, end); + if (!vma) { + if (unlock) + mmap_write_unlock(mm); + return 0; + } + + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); +} + +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. + */ +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma = vmg->vma; + struct vm_area_struct *merged; + + /* First, try to merge. */ + merged = vma_merge_existing_range(vmg); + if (merged) + return merged; + + /* Split any preceding portion of the VMA. */ + if (vma->vm_start < vmg->start) { + int err = split_vma(vmg->vmi, vma, vmg->start, 1); + + if (err) + return ERR_PTR(err); + } + + /* Split any trailing portion of the VMA. */ + if (vma->vm_end > vmg->end) { + int err = split_vma(vmg->vmi, vma, vmg->end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + +struct vm_area_struct *vma_modify_flags( + struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long new_flags) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.anon_name = new_name; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.policy = new_pol; + + return vma_modify(&vmg); +} + +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); + + vmg.flags = new_flags; + vmg.uffd_ctx = new_ctx; + + return vma_modify(&vmg); +} + +/* + * Expand vma by delta bytes, potentially merging with an immediately adjacent + * VMA with identical properties. + */ +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta) +{ + VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); + + vmg.next = vma_iter_next_rewind(vmi, NULL); + vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */ + + return vma_merge_new_range(&vmg); +} + +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) +{ + vb->count = 0; +} + +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) +{ + struct address_space *mapping; + int i; + + mapping = vb->vmas[0]->vm_file->f_mapping; + i_mmap_lock_write(mapping); + for (i = 0; i < vb->count; i++) { + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); + __remove_shared_vm_struct(vb->vmas[i], mapping); + } + i_mmap_unlock_write(mapping); + + unlink_file_vma_batch_init(vb); +} + +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, + struct vm_area_struct *vma) +{ + if (vma->vm_file == NULL) + return; + + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || + vb->count == ARRAY_SIZE(vb->vmas)) + unlink_file_vma_batch_process(vb); + + vb->vmas[vb->count] = vma; + vb->count++; +} + +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) +{ + if (vb->count > 0) + unlink_file_vma_batch_process(vb); +} + +/* + * Unlink a file-based vm structure from its interval tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + + i_mmap_lock_write(mapping); + __remove_shared_vm_struct(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + +void vma_link_file(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct address_space *mapping; + + if (file) { + mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } +} + +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + VMA_ITERATOR(vmi, mm, 0); + + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + return -ENOMEM; + + vma_start_write(vma); + vma_iter_store(&vmi, vma); + vma_link_file(vma); + mm->map_count++; + validate_mm(mm); + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma; + bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); + VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); + + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { + pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } + + new_vma = find_vma_prev(mm, addr, &vmg.prev); + if (new_vma && new_vma->vm_start < addr + len) + return NULL; /* should never get here */ + + vmg.vma = NULL; /* New VMA range. */ + vmg.pgoff = pgoff; + vmg.next = vma_iter_next_rewind(&vmi, NULL); + new_vma = vma_merge_new_range(&vmg); + + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. + */ + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); + *vmap = vma = new_vma; + } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); + } else { + new_vma = vm_area_dup(vma); + if (!new_vma) + goto out; + vma_set_range(new_vma, addr, addr + len, pgoff); + if (vma_dup_policy(vma, new_vma)) + goto out_free_vma; + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; + } + return new_vma; + +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); +out_free_mempol: + mpol_put(vma_policy(new_vma)); +out_free_vma: + vm_area_free(new_vma); +out: + return NULL; +} + +/* + * Rough compatibility check to quickly see if it's even worth looking + * at sharing an anon_vma. + * + * They need to have the same vm_file, and the flags can only differ + * in things that mprotect may change. + * + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that + * we can merge the two vma's. For example, we refuse to merge a vma if + * there is a vm_ops->close() function, because that indicates that the + * driver is doing some kind of reference counting. But that doesn't + * really matter for the anon_vma sharing case. + */ +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) +{ + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); +} + +/* + * Do some basic sanity checking to see if we can re-use the anon_vma + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be + * the same as 'old', the other will be the new one that is trying + * to share the anon_vma. + * + * NOTE! This runs with mmap_lock held for reading, so it is possible that + * the anon_vma of 'old' is concurrently in the process of being set up + * by another page fault trying to merge _that_. But that's ok: if it + * is being set up, that automatically means that it will be a singleton + * acceptable for merging, so we can do all of this optimistically. But + * we do that READ_ONCE() to make sure that we never re-load the pointer. + * + * IOW: that the "list_is_singular()" test on the anon_vma_chain only + * matters for the 'stable anon_vma' case (ie the thing we want to avoid + * is to return an anon_vma that is "complex" due to having gone through + * a fork). + * + * We also make sure that the two vma's are compatible (adjacent, + * and with the same memory policies). That's all stable, even with just + * a read lock on the mmap_lock. + */ +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, + struct vm_area_struct *a, + struct vm_area_struct *b) +{ + if (anon_vma_compatible(a, b)) { + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); + + if (anon_vma && list_is_singular(&old->anon_vma_chain)) + return anon_vma; + } + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); + + /* Try next first. */ + next = vma_iter_load(&vmi); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); + if (anon_vma) + return anon_vma; + } + + prev = vma_prev(&vmi); + VM_BUG_ON_VMA(prev != vma, vma); + prev = vma_prev(&vmi); + /* Try prev next. */ + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); + + /* + * We might reach here with anon_vma == NULL if we can't find + * any reusable anon_vma. + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return anon_vma; +} + +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) +{ + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); +} + +static bool vma_is_shared_writable(struct vm_area_struct *vma) +{ + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == + (VM_WRITE | VM_SHARED); +} + +static bool vma_fs_can_writeback(struct vm_area_struct *vma) +{ + /* No managed pages to writeback. */ + if (vma->vm_flags & VM_PFNMAP) + return false; + + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * Does this VMA require the underlying folios to have their dirty state + * tracked? + */ +bool vma_needs_dirty_tracking(struct vm_area_struct *vma) +{ + /* Only shared, writable VMAs require dirty tracking. */ + if (!vma_is_shared_writable(vma)) + return false; + + /* Does the filesystem need to be notified? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* + * Even if the filesystem doesn't indicate a need for writenotify, if it + * can writeback, dirty tracking is still required. + */ + return vma_fs_can_writeback(vma); +} + +/* + * Some shared mappings will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +{ + /* If it was private or non-writable, the write bit is already clear */ + if (!vma_is_shared_writable(vma)) + return false; + + /* The backer wishes to know when pages are first written to? */ + if (vm_ops_needs_writenotify(vma->vm_ops)) + return true; + + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) + return false; + + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) + return true; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_wp(vma)) + return true; + + /* Can the mapping track the dirty pages? */ + return vma_fs_can_writeback(vma); +} + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); + /* + * We can safely modify head.next after taking the + * anon_vma->root->rwsem. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->root->rwsem. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + } +} + +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_lock in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_lock until mm_drop_all_locks() returns. + * + * mmap_lock in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout. It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We take locks in following order, accordingly to comment at beginning + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all vmas marked locked + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we protected from parallel mm_take_all_locks() by + * mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + VMA_ITERATOR(vmi, mm, 0); + + mmap_assert_write_locked(mm); + + mutex_lock(&mm_all_locks_mutex); + + /* + * vma_start_write() does not have a complement in mm_drop_all_locks() + * because vma_start_write() is always asymmetrical; it marks a VMA as + * being written to until mmap_write_unlock() or mmap_write_downgrade() + * is reached. + */ + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + vma_start_write(vma); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); + } + + return 0; + +out_unlock: + mm_drop_all_locks(mm); + return -EINTR; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->rb_root will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->root->rwsem. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + anon_vma_unlock_write(anon_vma); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + i_mmap_unlock_write(mapping); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_lock cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + VMA_ITERATOR(vmi, mm, 0); + + mmap_assert_write_locked(mm); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for_each_vma(vmi, vma) { + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} diff --git a/mm/vma.h b/mm/vma.h new file mode 100644 index 000000000000..819f994cf727 --- /dev/null +++ b/mm/vma.h @@ -0,0 +1,558 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * vma.h + * + * Core VMA manipulation API implemented in vma.c. + */ +#ifndef __MM_VMA_H +#define __MM_VMA_H + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; + +struct unlink_vma_file_batch { + int count; + struct vm_area_struct *vmas[8]; +}; + +/* + * vma munmap operation + */ +struct vma_munmap_struct { + struct vma_iterator *vmi; + struct vm_area_struct *vma; /* The first vma to munmap */ + struct vm_area_struct *prev; /* vma before the munmap area */ + struct vm_area_struct *next; /* vma after the munmap area */ + struct list_head *uf; /* Userfaultfd list_head */ + unsigned long start; /* Aligned start addr (inclusive) */ + unsigned long end; /* Aligned end addr (exclusive) */ + unsigned long unmap_start; /* Unmap PTE start */ + unsigned long unmap_end; /* Unmap PTE end */ + int vma_count; /* Number of vmas that will be removed */ + bool unlock; /* Unlock after the munmap */ + bool clear_ptes; /* If there are outstanding PTE to be cleared */ + bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */ + /* 1 byte hole */ + unsigned long nr_pages; /* Number of pages being removed */ + unsigned long locked_vm; /* Number of locked pages */ + unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ + unsigned long exec_vm; + unsigned long stack_vm; + unsigned long data_vm; +}; + +enum vma_merge_state { + VMA_MERGE_START, + VMA_MERGE_ERROR_NOMEM, + VMA_MERGE_NOMERGE, + VMA_MERGE_SUCCESS, +}; + +/* Represents a VMA merge operation. */ +struct vma_merge_struct { + struct mm_struct *mm; + struct vma_iterator *vmi; + pgoff_t pgoff; + struct vm_area_struct *prev; + struct vm_area_struct *next; /* Modified by vma_merge(). */ + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ + unsigned long start; + unsigned long end; + unsigned long flags; + struct file *file; + struct anon_vma *anon_vma; + struct mempolicy *policy; + struct vm_userfaultfd_ctx uffd_ctx; + struct anon_vma_name *anon_name; + enum vma_merge_state state; +}; + +static inline bool vmg_nomem(struct vma_merge_struct *vmg) +{ + return vmg->state == VMA_MERGE_ERROR_NOMEM; +} + +/* Assumes addr >= vma->vm_start. */ +static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, + unsigned long addr) +{ + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); +} + +#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ + struct vma_merge_struct name = { \ + .mm = mm_, \ + .vmi = vmi_, \ + .start = start_, \ + .end = end_, \ + .flags = flags_, \ + .pgoff = pgoff_, \ + .state = VMA_MERGE_START, \ + } + +#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ + struct vma_merge_struct name = { \ + .mm = vma_->vm_mm, \ + .vmi = vmi_, \ + .prev = prev_, \ + .next = NULL, \ + .vma = vma_, \ + .start = start_, \ + .end = end_, \ + .flags = vma_->vm_flags, \ + .pgoff = vma_pgoff_offset(vma_, start_), \ + .file = vma_->vm_file, \ + .anon_vma = vma_->anon_vma, \ + .policy = vma_policy(vma_), \ + .uffd_ctx = vma_->vm_userfaultfd_ctx, \ + .anon_name = anon_vma_name(vma_), \ + .state = VMA_MERGE_START, \ + } + +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE +void validate_mm(struct mm_struct *mm); +#else +#define validate_mm(mm) do { } while (0) +#endif + +/* Required for expand_downwards(). */ +void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma); + +/* Required for expand_downwards(). */ +void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma); + +int vma_expand(struct vma_merge_struct *vmg); +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) + +{ + if (vmi->mas.status != ma_start && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +#ifdef CONFIG_MMU +/* + * init_vma_munmap() - Initializer wrapper for vma_munmap_struct + * @vms: The vma munmap struct + * @vmi: The vma iterator + * @vma: The first vm_area_struct to munmap + * @start: The aligned start address to munmap + * @end: The aligned end address to munmap + * @uf: The userfaultfd list_head + * @unlock: Unlock after the operation. Only unlocked on success + */ +static inline void init_vma_munmap(struct vma_munmap_struct *vms, + struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, struct list_head *uf, + bool unlock) +{ + vms->vmi = vmi; + vms->vma = vma; + if (vma) { + vms->start = start; + vms->end = end; + } else { + vms->start = vms->end = 0; + } + vms->unlock = unlock; + vms->uf = uf; + vms->vma_count = 0; + vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; + vms->exec_vm = vms->stack_vm = vms->data_vm = 0; + vms->unmap_start = FIRST_USER_ADDRESS; + vms->unmap_end = USER_PGTABLES_CEILING; + vms->clear_ptes = false; + vms->closed_vm_ops = false; +} +#endif + +int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +void vms_clean_up_area(struct vma_munmap_struct *vms, + struct ma_state *mas_detach); + +/* + * reattach_vmas() - Undo any munmap work and free resources + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas and free up the maple tree used to track the vmas. + */ +static inline void reattach_vmas(struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + vma_mark_detached(vma, false); + + __mt_destroy(mas_detach->tree); +} + +/* + * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() + * operation. + * @vms: The vma unmap structure + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas, free up the maple tree used to track the vmas. + * If that's not possible because the ptes are cleared (and vm_ops->closed() may + * have been called), then a NULL is written over the vmas and the vmas are + * removed (munmap() completed). + */ +static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct ma_state *mas = &vms->vmi->mas; + if (!vms->nr_pages) + return; + + if (vms->clear_ptes) + return reattach_vmas(mas_detach); + + /* + * Aborting cannot just call the vm_ops open() because they are often + * not symmetrical and state data has been lost. Resort to the old + * failure method of leaving a gap where the MAP_FIXED mapping failed. + */ + mas_set_range(mas, vms->start, vms->end - 1); + if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) { + pr_warn_once("%s: (%d) Unable to abort munmap() operation\n", + current->comm, current->pid); + /* Leaving vmas detached and in-tree may hamper recovery */ + reattach_vmas(mas_detach); + } else { + /* Clean up the insertion of the unfortunate gap */ + vms_complete_munmap_vmas(vms, mas_detach); + } +} + +int +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); + +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock); + +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed); + +void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct vm_area_struct *next); + +/* We are about to modify the VMA's flags. */ +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags); + +/* We are about to modify the VMA's flags and/or anon_name. */ +struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name); + +/* We are about to modify the VMA's memory policy. */ +struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol); + +/* We are about to modify the VMA's flags and/or uffd context. */ +struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx); + +struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); + +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); + +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); + +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); + +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, + struct vm_area_struct *vma); + +void unlink_file_vma(struct vm_area_struct *vma); + +void vma_link_file(struct vm_area_struct *vma); + +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); + +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks); + +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); + +bool vma_needs_dirty_tracking(struct vm_area_struct *vma); +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); + +int mm_take_all_locks(struct mm_struct *mm); +void mm_drop_all_locks(struct mm_struct *mm); + +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); +} + +#ifdef CONFIG_MMU +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} +#endif + +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area (including shadow stacks) + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + + +static inline void vma_iter_config(struct vma_iterator *vmi, + unsigned long index, unsigned long last) +{ + __mas_set_range(&vmi->mas, index, last - 1); +} + +static inline void vma_iter_reset(struct vma_iterator *vmi) +{ + mas_reset(&vmi->mas); +} + +static inline +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) +{ + return mas_prev_range(&vmi->mas, min); +} + +static inline +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) +{ + return mas_next_range(&vmi->mas, max); +} + +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area(&vmi->mas, min, max - 1, size); +} + +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, + unsigned long max, unsigned long size) +{ + return mas_empty_area_rev(&vmi->mas, min, max - 1, size); +} + +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi) +{ + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && + vmi->mas.index > vma->vm_start)) { + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", + vmi->mas.index, vma->vm_start, vma->vm_start, + vma->vm_end, vmi->mas.index, vmi->mas.last); + } + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && + vmi->mas.last < vma->vm_start)) { + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, + vmi->mas.index, vmi->mas.last); + } +#endif + + if (vmi->mas.status != ma_start && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(&vmi->mas, vma); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} + +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +static inline +struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) +{ + return mas_prev_range(&vmi->mas, 0); +} + +/* + * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or + * if no previous VMA, to index 0. + */ +static inline +struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *next = vma_next(vmi); + struct vm_area_struct *prev = vma_prev(vmi); + + /* + * Consider the case where no previous VMA exists. We advance to the + * next VMA, skipping any gap, then rewind to the start of the range. + * + * If we were to unconditionally advance to the next range we'd wind up + * at the next VMA again, so we check to ensure there is a previous VMA + * to skip over. + */ + if (prev) + vma_iter_next_range(vmi); + + if (pprev) + *pprev = prev; + + return next; +} + +#ifdef CONFIG_64BIT + +static inline bool vma_is_sealed(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_SEALED); +} + +/* + * check if a vma is sealed for modification. + * return true, if modification is allowed. + */ +static inline bool can_modify_vma(struct vm_area_struct *vma) +{ + if (unlikely(vma_is_sealed(vma))) + return false; + + return true; +} + +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); + +#else + +static inline bool can_modify_vma(struct vm_area_struct *vma) +{ + return true; +} + +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) +{ + return true; +} + +#endif + +#endif /* __MM_VMA_H */ diff --git a/mm/vma_internal.h b/mm/vma_internal.h new file mode 100644 index 000000000000..b930ab12a587 --- /dev/null +++ b/mm/vma_internal.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * vma_internal.h + * + * Headers required by vma.c, which can be substituted accordingly when testing + * VMA functionality. + */ + +#ifndef __MM_VMA_INTERNAL_H +#define __MM_VMA_INTERNAL_H + +#include <linux/backing-dev.h> +#include <linux/bitops.h> +#include <linux/bug.h> +#include <linux/cacheflush.h> +#include <linux/err.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/huge_mm.h> +#include <linux/hugetlb_inline.h> +#include <linux/kernel.h> +#include <linux/khugepaged.h> +#include <linux/list.h> +#include <linux/maple_tree.h> +#include <linux/mempolicy.h> +#include <linux/mm.h> +#include <linux/mm_inline.h> +#include <linux/mm_types.h> +#include <linux/mman.h> +#include <linux/mmap_lock.h> +#include <linux/mmdebug.h> +#include <linux/mmu_context.h> +#include <linux/mutex.h> +#include <linux/pagemap.h> +#include <linux/pfn.h> +#include <linux/rcupdate.h> +#include <linux/rmap.h> +#include <linux/rwsem.h> +#include <linux/sched/signal.h> +#include <linux/swap.h> +#include <linux/uprobes.h> +#include <linux/userfaultfd_k.h> + +#include <asm/current.h> +#include <asm/tlb.h> + +#include "internal.h" + +#endif /* __MM_VMA_INTERNAL_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a0df1e2e155a..634162271c00 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte) return -ENOMEM; do { - if (!pte_none(ptep_get(pte))) { + if (unlikely(!pte_none(ptep_get(pte)))) { if (pfn_valid(pfn)) { page = pfn_to_page(pfn); dump_page(page, "remapping already mapped page"); @@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -2018,7 +2018,7 @@ retry: if (vm) { vm->addr = (void *)va->va_start; - vm->size = va->va_end - va->va_start; + vm->size = va_size(va); va->vm = vm; } @@ -2131,23 +2131,18 @@ reclaim_list_global(struct list_head *head) static void decay_va_pool_node(struct vmap_node *vn, bool full_decay) { + LIST_HEAD(decay_list); + struct rb_root decay_root = RB_ROOT; struct vmap_area *va, *nva; - struct list_head decay_list; - struct rb_root decay_root; unsigned long n_decay; int i; - decay_root = RB_ROOT; - INIT_LIST_HEAD(&decay_list); - for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { - struct list_head tmp_list; + LIST_HEAD(tmp_list); if (list_empty(&vn->pool[i].head)) continue; - INIT_LIST_HEAD(&tmp_list); - /* Detach the pool, so no-one can access it. */ spin_lock(&vn->pool_lock); list_replace_init(&vn->pool[i].head, &tmp_list); @@ -2198,7 +2193,7 @@ static void purge_vmap_node(struct work_struct *work) vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { - unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = va_size(va) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; unsigned int vn_id = decode_vn_id(va->flags); @@ -2344,8 +2339,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) if (WARN_ON_ONCE(!list_empty(&va->list))) return; - nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> - PAGE_SHIFT, &vmap_lazy_nr); + nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT, + &vmap_lazy_nr); /* * If it was request by a certain node we would like to @@ -2941,8 +2936,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (WARN_ON_ONCE(!va)) return; - debug_check_no_locks_freed((void *)va->va_start, - (va->va_end - va->va_start)); + debug_check_no_locks_freed((void *)va->va_start, va_size(va)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -3518,8 +3512,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; - gfp_t alloc_gfp = gfp; - bool nofail = gfp & __GFP_NOFAIL; struct page *page; int i; @@ -3530,9 +3522,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { - /* bulk allocator doesn't support nofail req. officially */ - gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; - while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -3550,12 +3539,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(gfp, nr_pages_request, pages + nr_allocated); - else - nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3569,30 +3557,24 @@ vm_area_alloc_pages(gfp_t gfp, int nid, if (nr != nr_pages_request) break; } - } else if (gfp & __GFP_NOFAIL) { - /* - * Higher order nofail allocations are really expensive and - * potentially dangerous (pre-mature OOM, disruptive reclaim - * and compaction etc. - */ - alloc_gfp &= ~__GFP_NOFAIL; } /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { - if (!nofail && fatal_signal_pending(current)) + if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) break; if (nid == NUMA_NO_NODE) - page = alloc_pages_noprof(alloc_gfp, order); + page = alloc_pages_noprof(gfp, order); else - page = alloc_pages_node_noprof(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, gfp, order); + if (unlikely(!page)) break; /* - * Higher order allocations must be able to be treated as - * indepdenent small pages by callers (as they can with + * High-order allocations must be able to be treated as + * independent small pages by callers (as they can with * small-page vmallocs). Some drivers do their own refcounting * on vmalloc_to_page() pages, some use page->mapping, * page->lru, etc. @@ -3653,7 +3635,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + /* + * High-order nofail allocations are really expensive and + * potentially dangerous (pre-mature OOM, disruptive reclaim + * and compaction etc. + * + * Please note, the __vmalloc_node_range_noprof() falls-back + * to order-0 pages if high-order attempt is unsuccessful. + */ + area->nr_pages = vm_area_alloc_pages((page_order ? + gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -4033,6 +4024,76 @@ void *vzalloc_node_noprof(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node_noprof); +/** + * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and + * @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or vfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory; %NULL if @size is zero or in case of + * failure + */ +void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + size_t old_size = 0; + void *n; + + if (!size) { + vfree(p); + return NULL; + } + + if (p) { + struct vm_struct *vm; + + vm = find_vm_area(p); + if (unlikely(!vm)) { + WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); + return NULL; + } + + old_size = get_vm_area_size(vm); + } + + /* + * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What + * would be a good heuristic for when to shrink the vm_area? + */ + if (size <= old_size) { + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) + memset((void *)p + size, 0, old_size - size); + + return (void *)p; + } + + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ + n = __vmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + memcpy(n, p, old_size); + vfree(p); + } + + return n; +} + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) @@ -4873,7 +4934,7 @@ static void show_purge_info(struct seq_file *m) list_for_each_entry(va, &vn->lazy.head, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); } spin_unlock(&vn->lazy.lock); } @@ -4895,7 +4956,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p) if (va->flags & VMAP_RAM) seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + va_size(va)); continue; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a8d61a8b6894..749cdc110c74 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -628,7 +628,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, - struct swap_iocb **plug) + struct swap_iocb **plug, struct list_head *folio_list) { /* * If the folio is dirty, only perform writeback if that write @@ -676,6 +676,14 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, .swap_plug = plug, }; + /* + * The large shmem folio can be split if CONFIG_THP_SWAP is + * not enabled or contiguous swap entries are failed to + * allocate. + */ + if (shmem_mapping(mapping) && folio_test_large(folio)) + wbc.list = folio_list; + folio_set_reclaim(folio); res = mapping->a_ops->writepage(&folio->page, &wbc); if (res < 0) @@ -863,7 +871,12 @@ static enum folio_references folio_check_references(struct folio *folio, if (vm_flags & VM_LOCKED) return FOLIOREF_ACTIVATE; - /* rmap lock contention: rotate */ + /* + * There are two cases to consider. + * 1) Rmap lock contention: rotate. + * 2) Skip the non-shared swapbacked folio mapped solely by + * the exiting or OOM-reaped process. + */ if (referenced_ptes == -1) return FOLIOREF_KEEP; @@ -1003,9 +1016,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(), - nr_succeeded); - return nr_succeeded; } @@ -1222,13 +1232,14 @@ retry: goto keep_locked; if (folio_test_large(folio)) { /* cannot split folio, skip it */ - if (!can_split_folio(folio, NULL)) + if (!can_split_folio(folio, 1, NULL)) goto activate_locked; /* * Split partially mapped folios right away. * We can free the unmapped pages without IO. */ - if (data_race(!list_empty(&folio->_deferred_list)) && + if (data_race(!list_empty(&folio->_deferred_list) && + folio_test_partially_mapped(folio)) && split_folio_to_list(folio, folio_list)) goto activate_locked; } @@ -1252,11 +1263,6 @@ retry: goto activate_locked_split; } } - } else if (folio_test_swapbacked(folio) && - folio_test_large(folio)) { - /* Split shmem folio */ - if (split_folio_to_list(folio, folio_list)) - goto keep_locked; } /* @@ -1357,12 +1363,25 @@ retry: * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(folio, mapping, &plug)) { + switch (pageout(folio, mapping, &plug, folio_list)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: + /* + * If shmem folio is split when writeback to swap, + * the tail pages will make their own pass through + * this function and be accounted then. + */ + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } goto activate_locked; case PAGE_SUCCESS: + if (nr_pages > 1 && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } stat->nr_pageout += nr_pages; if (folio_test_writeback(folio)) @@ -1495,7 +1514,8 @@ keep: /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + stat->nr_demoted = demote_folio_list(&demote_folios, pgdat); + nr_reclaimed += stat->nr_demoted; /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { /* Folios which weren't demoted go back on @folio_list */ @@ -1941,6 +1961,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); move_folios_to_lru(lruvec, &folio_list); + __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), + stat.nr_demoted); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) @@ -2239,10 +2261,11 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* - * Flush the memory cgroup stats, so that we read accurate per-memcg - * lruvec stats for heuristics. + * Flush the memory cgroup stats in rate-limited way as we don't need + * most accurate stats here. We may switch to regular stats flushing + * in the future once it is cheap enough. */ - mem_cgroup_flush_stats(sc->target_mem_cgroup); + mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup); /* * Determine the scan balance between anon and file LRUs. @@ -3456,7 +3479,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area goto next; if (!pmd_trans_huge(pmd[i])) { - if (should_clear_pmd_young()) + if (!walk->force_scan && should_clear_pmd_young()) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -3543,7 +3566,7 @@ restart: walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (should_clear_pmd_young()) { + if (!walk->force_scan && should_clear_pmd_young()) { if (!pmd_young(val)) continue; @@ -6644,7 +6667,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) continue; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) - mark = wmark_pages(zone, WMARK_PROMO); + mark = promo_wmark_pages(zone); else mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) @@ -7519,7 +7542,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) ret = __node_reclaim(pgdat, gfp_mask, order); clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); - if (!ret) + if (ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS); + else count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); return ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index e875f2a4915f..b5a4cea423e1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1314,6 +1314,7 @@ const char * const vmstat_text[] = { "pgsteal_file", #ifdef CONFIG_NUMA + "zone_reclaim_success", "zone_reclaim_failed", #endif "pginodesteal", @@ -1384,6 +1385,7 @@ const char * const vmstat_text[] = { "thp_split_page", "thp_split_page_failed", "thp_deferred_split_page", + "thp_underused_split_page", "thp_split_pmd", "thp_scan_exceed_none_pte", "thp_scan_exceed_swap_pte", @@ -1435,6 +1437,30 @@ const char * const vmstat_text[] = { "vma_lock_retry", "vma_lock_miss", #endif +#ifdef CONFIG_DEBUG_STACK_USAGE + "kstack_1k", +#if THREAD_SIZE > 1024 + "kstack_2k", +#endif +#if THREAD_SIZE > 2048 + "kstack_4k", +#endif +#if THREAD_SIZE > 4096 + "kstack_8k", +#endif +#if THREAD_SIZE > 8192 + "kstack_16k", +#endif +#if THREAD_SIZE > 16384 + "kstack_32k", +#endif +#if THREAD_SIZE > 32768 + "kstack_64k", +#endif +#if THREAD_SIZE > 65536 + "kstack_rest", +#endif +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ @@ -1718,6 +1744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" + "\n promo %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu" @@ -1727,6 +1754,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), + promo_wmark_pages(zone), zone->spanned_pages, zone->present_pages, zone_managed_pages(zone), diff --git a/mm/z3fold.c b/mm/z3fold.c index 2ebfed32871b..379d24b4fef9 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -144,7 +144,7 @@ struct z3fold_pool { const char *name; spinlock_t lock; spinlock_t stale_lock; - struct list_head *unbuddied; + struct list_head __percpu *unbuddied; struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b572aa84823c..16a07def09c9 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -20,7 +20,7 @@ * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. - * page->page_type: PG_zsmalloc, lower 16 bit locate the first object + * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object * offset in a subpage of a zspage * * Usage of struct page flags: @@ -463,13 +463,7 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -#define FIRST_OBJ_PAGE_TYPE_MASK 0xffff - -static inline void reset_first_obj_offset(struct page *page) -{ - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - page->page_type |= FIRST_OBJ_PAGE_TYPE_MASK; -} +#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff static inline unsigned int get_first_obj_offset(struct page *page) { @@ -479,8 +473,8 @@ static inline unsigned int get_first_obj_offset(struct page *page) static inline void set_first_obj_offset(struct page *page, unsigned int offset) { - /* With 16 bit available, we can support offsets into 64 KiB pages. */ - BUILD_BUG_ON(PAGE_SIZE > SZ_64K); + /* With 24 bits available, we can support offsets into 16 MiB pages. */ + BUILD_BUG_ON(PAGE_SIZE > SZ_16M); VM_WARN_ON_ONCE(!PageZsmalloc(page)); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; @@ -819,7 +813,6 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page->index = 0; - reset_first_obj_offset(page); __ClearPageZsmalloc(page); } diff --git a/mm/zswap.c b/mm/zswap.c index adeaf9c97fde..449914ea9919 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -44,8 +44,6 @@ **********************************/ /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); -/* The number of same-value filled pages currently stored in zswap */ -static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -185,8 +183,11 @@ static struct shrinker *zswap_shrinker; * * swpentry - associated swap entry, the offset indexes into the red-black tree * length - the length in bytes of the compressed page data. Needed during - * decompression. For a same value filled page length is 0, and both - * pool and lru are invalid and must be ignored. + * decompression. + * referenced - true if the entry recently entered the zswap pool. Unset by the + * writeback logic. The entry is only reclaimed by the writeback + * logic if referenced is unset. See comments in the shrinker + * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content @@ -196,11 +197,9 @@ static struct shrinker *zswap_shrinker; struct zswap_entry { swp_entry_t swpentry; unsigned int length; + bool referenced; struct zswap_pool *pool; - union { - unsigned long handle; - unsigned long value; - }; + unsigned long handle; struct obj_cgroup *objcg; struct list_head lru; }; @@ -700,11 +699,8 @@ static inline int entry_to_nid(struct zswap_entry *entry) static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { - atomic_long_t *nr_zswap_protected; - unsigned long lru_size, old, new; int nid = entry_to_nid(entry); struct mem_cgroup *memcg; - struct lruvec *lruvec; /* * Note that it is safe to use rcu_read_lock() here, even in the face of @@ -722,19 +718,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) memcg = mem_cgroup_from_entry(entry); /* will always succeed */ list_lru_add(list_lru, &entry->lru, nid, memcg); - - /* Update the protection area */ - lru_size = list_lru_count_one(list_lru, nid, memcg); - lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected; - old = atomic_long_inc_return(nr_zswap_protected); - /* - * Decay to avoid overflow and adapt to changing workloads. - * This is based on LRU reclaim cost decaying heuristics. - */ - do { - new = old > lru_size / 4 ? old / 2 : old; - } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); rcu_read_unlock(); } @@ -752,7 +735,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry) void zswap_lruvec_state_init(struct lruvec *lruvec) { - atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); + atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0); } void zswap_folio_swapin(struct folio *folio) @@ -761,16 +744,29 @@ void zswap_folio_swapin(struct folio *folio) if (folio) { lruvec = folio_lruvec(folio); - atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins); } } +/* + * This function should be called when a memcg is being offlined. + * + * Since the global shrinker shrink_worker() may hold a reference + * of the memcg, we must check and release the reference in + * zswap_next_shrink. + * + * shrink_worker() must handle the case where this function releases + * the reference of memcg being shrunk. + */ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) { /* lock out zswap shrinker walking memcg tree */ spin_lock(&zswap_shrink_lock); - if (zswap_next_shrink == memcg) - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + if (zswap_next_shrink == memcg) { + do { + zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink)); + } spin_unlock(&zswap_shrink_lock); } @@ -799,13 +795,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) */ static void zswap_entry_free(struct zswap_entry *entry) { - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); - } + zswap_lru_del(&zswap_list_lru, entry); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); @@ -1082,6 +1074,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry, /********************************* * shrinker functions **********************************/ +/* + * The dynamic shrinker is modulated by the following factors: + * + * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving + * the entry a second chance) before rotating it in the LRU list. If the + * entry is considered again by the shrinker, with its referenced bit unset, + * it is written back. The writeback rate as a result is dynamically + * adjusted by the pool activities - if the pool is dominated by new entries + * (i.e lots of recent zswapouts), these entries will be protected and + * the writeback rate will slow down. On the other hand, if the pool has a + * lot of stagnant entries, these entries will be reclaimed immediately, + * effectively increasing the writeback rate. + * + * 2. Swapins counter: If we observe swapins, it is a sign that we are + * overshrinking and should slow down. We maintain a swapins counter, which + * is consumed and subtract from the number of eligible objects on the LRU + * in zswap_shrinker_count(). + * + * 3. Compression ratio. The better the workload compresses, the less gains we + * can expect from writeback. We scale down the number of objects available + * for reclaim by this ratio. + */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, spinlock_t *lock, void *arg) { @@ -1092,6 +1106,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o int writeback_result; /* + * Second chance algorithm: if the entry has its referenced bit set, give it + * a second chance. Only clear the referenced bit and rotate it in the + * zswap's LRU list. + */ + if (entry->referenced) { + entry->referenced = false; + return LRU_ROTATE; + } + + /* * As soon as we drop the LRU lock, the entry can be freed by * a concurrent invalidation. This means the following: * @@ -1157,8 +1181,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); - unsigned long shrink_ret, nr_protected, lru_size; + unsigned long shrink_ret; bool encountered_page_in_swapcache = false; if (!zswap_shrinker_enabled || @@ -1167,25 +1190,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, return SHRINK_STOP; } - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); - lru_size = list_lru_shrink_count(&zswap_list_lru, sc); - - /* - * Abort if we are shrinking into the protected region. - * - * This short-circuiting is necessary because if we have too many multiple - * concurrent reclaimers getting the freeable zswap object counts at the - * same time (before any of them made reasonable progress), the total - * number of reclaimed objects might be more than the number of unprotected - * objects (i.e the reclaimers will reclaim into the protected area of the - * zswap LRU). - */ - if (nr_protected >= lru_size - sc->nr_to_scan) { - sc->nr_scanned = 0; - return SHRINK_STOP; - } - shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb, &encountered_page_in_swapcache); @@ -1200,7 +1204,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, { struct mem_cgroup *memcg = sc->memcg; struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); - unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + atomic_long_t *nr_disk_swapins = + &lruvec->zswap_lruvec_state.nr_disk_swapins; + unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur, + nr_remain; if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; @@ -1233,25 +1240,33 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!nr_stored) return 0; - nr_protected = - atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc); + if (!nr_freeable) + return 0; + /* - * Subtract the lru size by an estimate of the number of pages - * that should be protected. + * Subtract from the lru size the number of pages that are recently swapped + * in from disk. The idea is that had we protect the zswap's LRU by this + * amount of pages, these disk swapins would not have happened. */ - nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins); + do { + if (nr_freeable >= nr_disk_swapins_cur) + nr_remain = 0; + else + nr_remain = nr_disk_swapins_cur - nr_freeable; + } while (!atomic_long_try_cmpxchg( + nr_disk_swapins, &nr_disk_swapins_cur, nr_remain)); + + nr_freeable -= nr_disk_swapins_cur - nr_remain; + if (!nr_freeable) + return 0; /* * Scale the number of freeable pages by the memory saving factor. * This ensures that the better zswap compresses memory, the fewer * pages we will evict to swap (as it will otherwise incur IO for * relatively small memory saving). - * - * The memory saving factor calculated here takes same-filled pages into - * account, but those are not freeable since they almost occupy no - * space. Hence, we may scale nr_freeable down a little bit more than we - * should if we have a lot of same-filled pages. */ return mult_frac(nr_freeable, nr_backing, nr_stored); } @@ -1274,10 +1289,10 @@ static struct shrinker *zswap_alloc_shrinker(void) static int shrink_memcg(struct mem_cgroup *memcg) { - int nid, shrunk = 0; + int nid, shrunk = 0, scanned = 0; if (!mem_cgroup_zswap_writeback_enabled(memcg)) - return -EINVAL; + return -ENOENT; /* * Skip zombies because their LRUs are reparented and we would be @@ -1291,63 +1306,94 @@ static int shrink_memcg(struct mem_cgroup *memcg) shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb, NULL, &nr_to_walk); + scanned += 1 - nr_to_walk; } + + if (!scanned) + return -ENOENT; + return shrunk ? 0 : -EAGAIN; } static void shrink_worker(struct work_struct *w) { struct mem_cgroup *memcg; - int ret, failures = 0; + int ret, failures = 0, attempts = 0; unsigned long thr; /* Reclaim down to the accept threshold */ thr = zswap_accept_thr_pages(); - /* global reclaim will select cgroup in a round-robin fashion. */ + /* + * Global reclaim will select cgroup in a round-robin fashion from all + * online memcgs, but memcgs that have no pages in zswap and + * writeback-disabled memcgs (memory.zswap.writeback=0) are not + * candidates for shrinking. + * + * Shrinking will be aborted if we encounter the following + * MAX_RECLAIM_RETRIES times: + * - No writeback-candidate memcgs found in a memcg tree walk. + * - Shrinking a writeback-candidate memcg failed. + * + * We save iteration cursor memcg into zswap_next_shrink, + * which can be modified by the offline memcg cleaner + * zswap_memcg_offline_cleanup(). + * + * Since the offline cleaner is called only once, we cannot leave an + * offline memcg reference in zswap_next_shrink. + * We can rely on the cleaner only if we get online memcg under lock. + * + * If we get an offline memcg, we cannot determine if the cleaner has + * already been called or will be called later. We must put back the + * reference before returning from this function. Otherwise, the + * offline memcg left in zswap_next_shrink will hold the reference + * until the next run of shrink_worker(). + */ do { - spin_lock(&zswap_shrink_lock); - zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); - memcg = zswap_next_shrink; - /* - * We need to retry if we have gone through a full round trip, or if we - * got an offline memcg (or else we risk undoing the effect of the - * zswap memcg offlining cleanup callback). This is not catastrophic - * per se, but it will keep the now offlined memcg hostage for a while. + * Start shrinking from the next memcg after zswap_next_shrink. + * When the offline cleaner has already advanced the cursor, + * advancing the cursor here overlooks one memcg, but this + * should be negligibly rare. * - * Note that if we got an online memcg, we will keep the extra - * reference in case the original reference obtained by mem_cgroup_iter - * is dropped by the zswap memcg offlining callback, ensuring that the - * memcg is not killed when we are reclaiming. + * If we get an online memcg, keep the extra reference in case + * the original one obtained by mem_cgroup_iter() is dropped by + * zswap_memcg_offline_cleanup() while we are shrinking the + * memcg. */ - if (!memcg) { - spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) - break; - - goto resched; - } - - if (!mem_cgroup_tryget_online(memcg)) { - /* drop the reference from mem_cgroup_iter() */ - mem_cgroup_iter_break(NULL, memcg); - zswap_next_shrink = NULL; - spin_unlock(&zswap_shrink_lock); + spin_lock(&zswap_shrink_lock); + do { + memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL); + zswap_next_shrink = memcg; + } while (memcg && !mem_cgroup_tryget_online(memcg)); + spin_unlock(&zswap_shrink_lock); - if (++failures == MAX_RECLAIM_RETRIES) + if (!memcg) { + /* + * Continue shrinking without incrementing failures if + * we found candidate memcgs in the last tree walk. + */ + if (!attempts && ++failures == MAX_RECLAIM_RETRIES) break; + attempts = 0; goto resched; } - spin_unlock(&zswap_shrink_lock); ret = shrink_memcg(memcg); /* drop the extra reference */ mem_cgroup_put(memcg); - if (ret == -EINVAL) - break; + /* + * There are no writeback-candidate pages in the memcg. + * This is not an issue as long as we can find another memcg + * with pages in zswap. Skip this without incrementing attempts + * and failures. + */ + if (ret == -ENOENT) + continue; + ++attempts; + if (ret && ++failures == MAX_RECLAIM_RETRIES) break; resched: @@ -1356,42 +1402,6 @@ resched: } /********************************* -* same-filled functions -**********************************/ -static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value) -{ - unsigned long *data; - unsigned long val; - unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1; - bool ret = false; - - data = kmap_local_folio(folio, 0); - val = data[0]; - - if (val != data[last_pos]) - goto out; - - for (pos = 1; pos < last_pos; pos++) { - if (val != data[pos]) - goto out; - } - - *value = val; - ret = true; -out: - kunmap_local(data); - return ret; -} - -static void zswap_fill_folio(struct folio *folio, unsigned long value) -{ - unsigned long *data = kmap_local_folio(folio, 0); - - memset_l(data, value, PAGE_SIZE / sizeof(unsigned long)); - kunmap_local(data); -} - -/********************************* * main API **********************************/ bool zswap_store(struct folio *folio) @@ -1402,7 +1412,6 @@ bool zswap_store(struct folio *folio) struct zswap_entry *entry, *old; struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; - unsigned long value; VM_WARN_ON_ONCE(!folio_test_locked(folio)); VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); @@ -1435,13 +1444,6 @@ bool zswap_store(struct folio *folio) goto reject; } - if (zswap_is_folio_same_filled(folio, &value)) { - entry->length = 0; - entry->value = value; - atomic_inc(&zswap_same_filled_pages); - goto store_entry; - } - /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) @@ -1459,9 +1461,9 @@ bool zswap_store(struct folio *folio) if (!zswap_compress(folio, entry)) goto put_pool; -store_entry: entry->swpentry = swp; entry->objcg = objcg; + entry->referenced = true; old = xa_store(tree, offset, entry, GFP_KERNEL); if (xa_is_err(old)) { @@ -1507,13 +1509,9 @@ store_entry: return true; store_failed: - if (!entry->length) - atomic_dec(&zswap_same_filled_pages); - else { - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); put_pool: - zswap_pool_put(entry->pool); - } + zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); reject: @@ -1576,10 +1574,7 @@ bool zswap_load(struct folio *folio) if (!entry) return false; - if (entry->length) - zswap_decompress(entry, folio); - else - zswap_fill_folio(folio, entry->value); + zswap_decompress(entry, folio); count_vm_event(ZSWPIN); if (entry->objcg) @@ -1682,8 +1677,6 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_atomic_t("stored_pages", 0444, zswap_debugfs_root, &zswap_stored_pages); - debugfs_create_atomic_t("same_filled_pages", 0444, - zswap_debugfs_root, &zswap_same_filled_pages); return 0; } |