diff options
Diffstat (limited to 'mm')
84 files changed, 4122 insertions, 3339 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..f2104cc0d35c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. -config HAVE_MEMBLOCK_NODE_MAP - bool - config HAVE_MEMBLOCK_PHYS_MAP bool @@ -136,6 +133,9 @@ config HAVE_FAST_GUP depends on MMU bool +# Don't discard allocated memory used to track "memory" and "reserved" memblocks +# after early boot, so it can still be used to test for validity of memory. +# Also, memblocks are updated with memory hot(un)plug. config ARCH_KEEP_MEMBLOCK bool @@ -158,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on 64BIT || BROKEN select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE @@ -192,6 +193,9 @@ config MEMORY_HOTREMOVE # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# SPARC32 allocates multiple pte tables within a single page, and therefore +# a per-page lock leads to problems when multiple tables need to be locked +# at the same time (e.g. copy_page_range()). # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS @@ -199,6 +203,7 @@ config SPLIT_PTLOCK_CPUS default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 + default "999999" if SPARC32 default "4" config ARCH_ENABLE_SPLIT_PMD_PTLOCK @@ -705,9 +710,9 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC + depends on ZSMALLOC=y help By default, zsmalloc uses a copy-based object mapping method to access allocations that span two pages. However, if a particular @@ -750,13 +755,13 @@ config DEFERRED_STRUCT_PAGE_INIT depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT + select PADATA help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up - a subset of memmap at boot and then initialise the rest in parallel - by starting one-off "pgdatinitX" kernel thread for each node X. This - has a potential performance impact on processes running early in the + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the lifetime of the system until these kthreads finish the initialisation. diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 0271b22e063f..864f129f1937 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config PAGE_EXTENSION bool "Extend memmap on extra space for more information on page" - ---help--- + help Extend memmap on extra space for more information on page. This could be used for debugging features that need to insert extra field for every page. This extension enables us to save memory @@ -13,7 +13,7 @@ config DEBUG_PAGEALLOC depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC - ---help--- + help Unmap pages from the kernel linear mapping after free_pages(). Depending on runtime enablement, this results in a small or large slowdown, but helps to find certain types of memory corruption. @@ -41,7 +41,7 @@ config DEBUG_PAGEALLOC config DEBUG_PAGEALLOC_ENABLE_DEFAULT bool "Enable debug page memory allocations by default?" depends on DEBUG_PAGEALLOC - ---help--- + help Enable debug page memory allocations by default? This value can be overridden by debug_pagealloc=off|on. @@ -65,7 +65,7 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" select PAGE_POISONING_NO_SANITY if HIBERNATION - ---help--- + help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps reduce the risk of information leaks from freed data. This does @@ -80,7 +80,7 @@ config PAGE_POISONING config PAGE_POISONING_NO_SANITY depends on PAGE_POISONING bool "Only poison, don't sanity check" - ---help--- + help Skip the sanity checking on alloc, only fill the pages with poison on free. This reduces some of the overhead of the poisoning feature. @@ -91,7 +91,7 @@ config PAGE_POISONING_NO_SANITY config PAGE_POISONING_ZERO bool "Use zero for poisoning instead of debugging value" depends on PAGE_POISONING - ---help--- + help Instead of using the existing poison value, fill the pages with zeros. This makes it harder to detect when errors are occurring due to sanitization but the zeroing at free means that it is @@ -104,7 +104,7 @@ config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" depends on DEBUG_KERNEL depends on TRACEPOINTS - ---help--- + help This is a feature to add tracepoint for tracking down page reference manipulation. This tracking is useful to diagnose functional failure due to migration failures caused by page reference mismatches. Be @@ -115,9 +115,41 @@ config DEBUG_PAGE_REF config DEBUG_RODATA_TEST bool "Testcase for the marking rodata read-only" depends on STRICT_KERNEL_RWX - ---help--- + help This option enables a testcase for the setting rodata read-only. +config ARCH_HAS_DEBUG_WX + bool + +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on ARCH_HAS_DEBUG_WX + depends on MMU + select PTDUMP_CORE + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving W+X + mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + <arch>/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config GENERIC_PTDUMP bool diff --git a/mm/Makefile b/mm/Makefile index fccd3756b25f..6e9d46b2efc9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,6 +8,14 @@ KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n KCSAN_SANITIZE_kmemleak.o := n +# These produce frequent data race reports: most of them are due to races on +# the same word but accesses to different bits of that word. Re-enable KCSAN +# for these when we have more consensus on what to do about them. +KCSAN_SANITIZE_slab_common.o := n +KCSAN_SANITIZE_slab.o := n +KCSAN_SANITIZE_slub.o := n +KCSAN_SANITIZE_page_alloc.o := n + # These files are disabled because they produce non-interesting and/or # flaky coverage that is not a function of syscall inputs. E.g. slab is out of # free pages, or a task is migrated between nodes. @@ -41,7 +49,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ - mm_init.o mmu_context.o percpu.o slab_common.o \ + mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o $(mmu-y) @@ -88,6 +96,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o +obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c81b4f3a7268..d382272bcc31 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -15,13 +15,12 @@ #include <trace/events/writeback.h> struct backing_dev_info noop_backing_dev_info = { - .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -const char *bdi_unknown_name = "(unknown)"; +static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU @@ -865,12 +864,11 @@ static int bdi_init(struct backing_dev_info *bdi) return ret; } -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; - bdi = kmalloc_node(sizeof(struct backing_dev_info), - gfp_mask | __GFP_ZERO, node_id); + bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; @@ -880,7 +878,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } return bdi; } -EXPORT_SYMBOL(bdi_alloc_node); +EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { @@ -938,7 +936,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); + vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); + dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -963,7 +962,6 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) trace_writeback_bdi_register(bdi); return 0; } -EXPORT_SYMBOL(bdi_register_va); int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { @@ -977,20 +975,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) } EXPORT_SYMBOL(bdi_register); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { - int rc; - - rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); - if (rc) - return rc; - /* Leaking owner reference... */ - WARN_ON(bdi->owner); + WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); - return 0; } -EXPORT_SYMBOL(bdi_register_owner); /* * Remove bdi from bdi_list, and ensure that it is no longer visible @@ -1043,6 +1033,14 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); +const char *bdi_dev_name(struct backing_dev_info *bdi) +{ + if (!bdi || !bdi->dev) + return bdi_unknown_name; + return bdi->dev_name; +} +EXPORT_SYMBOL_GPL(bdi_dev_name); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..fd988b7e5f2b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1401,7 +1401,7 @@ fast_isolate_freepages(struct compact_control *cc) if (scan_start) { /* * Use the highest PFN found above min. If one was - * not found, be pessemistic for direct compaction + * not found, be pessimistic for direct compaction * and use the min mark. */ if (highest) { @@ -1409,7 +1409,9 @@ fast_isolate_freepages(struct compact_control *cc) cc->free_pfn = highest; } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { - page = pfn_to_page(min_pfn); + page = pageblock_pfn_to_page(min_pfn, + pageblock_end_pfn(min_pfn), + cc->zone); cc->free_pfn = min_pfn; } } @@ -1966,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc) */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx, + int highest_zoneidx, unsigned long wmark_target) { unsigned long watermark; @@ -1979,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ - if (zone_watermark_ok(zone, order, watermark, classzone_idx, + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, alloc_flags)) return COMPACT_SUCCESS; @@ -1989,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * watermark and alloc_flags have to match, or be more pessimistic than * the check in __isolate_free_page(). We don't use the direct * compactor's alloc_flags, as they are not relevant for freepage - * isolation. We however do use the direct compactor's classzone_idx to - * skip over zones where lowmem reserves would prevent allocation even - * if compaction succeeds. + * isolation. We however do use the direct compactor's highest_zoneidx + * to skip over zones where lowmem reserves would prevent allocation + * even if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. * ALLOC_CMA is used, as pages in CMA pageblocks are considered @@ -2000,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); - if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx, ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; @@ -2009,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int highest_zoneidx) { enum compact_result ret; int fragindex; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx, zone_page_state(zone, NR_FREE_PAGES)); /* * fragmentation index determines if allocation failures are due to @@ -2055,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, * Make sure at least one zone would pass __compaction_suitable if we continue * retrying the reclaim. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; enum compact_result compact_result; @@ -2069,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available = zone_reclaimable_pages(zone) / order; available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, - ac_classzone_idx(ac), available); + ac->highest_zoneidx, available); if (compact_result != COMPACT_SKIPPED) return true; } @@ -2098,9 +2100,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) INIT_LIST_HEAD(&cc->freepages); INIT_LIST_HEAD(&cc->migratepages); - cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + cc->migratetype = gfp_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, - cc->classzone_idx); + cc->highest_zoneidx); /* Compaction is likely to fail */ if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; @@ -2243,15 +2245,11 @@ check_drain: * would succeed. */ if (cc->order > 0 && last_migrated_pfn) { - int cpu; unsigned long current_block_start = block_start_pfn(cc->migrate_pfn, cc->order); if (last_migrated_pfn < current_block_start) { - cpu = get_cpu(); - lru_add_drain_cpu(cpu); - drain_local_pages(cc->zone); - put_cpu(); + lru_add_drain_cpu_zone(cc->zone); /* No more flushing until we migrate again */ last_migrated_pfn = 0; } @@ -2295,7 +2293,7 @@ out: static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum compact_priority prio, - unsigned int alloc_flags, int classzone_idx, + unsigned int alloc_flags, int highest_zoneidx, struct page **capture) { enum compact_result ret; @@ -2307,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .mode = (prio == COMPACT_PRIO_ASYNC) ? MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT, .alloc_flags = alloc_flags, - .classzone_idx = classzone_idx, + .highest_zoneidx = highest_zoneidx, .direct_compaction = true, .whole_zone = (prio == MIN_COMPACT_PRIORITY), .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), @@ -2363,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { enum compact_result status; if (prio > MIN_COMPACT_PRIORITY @@ -2374,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, } status = compact_zone_order(zone, order, gfp_mask, prio, - alloc_flags, ac_classzone_idx(ac), capture); + alloc_flags, ac->highest_zoneidx, capture); rc = max(status, rc); /* The allocation should succeed, stop compacting */ @@ -2463,7 +2461,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); @@ -2509,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) { int zoneid; struct zone *zone; - enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; - for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) continue; if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, - classzone_idx) == COMPACT_CONTINUE) + highest_zoneidx) == COMPACT_CONTINUE) return true; } @@ -2536,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .classzone_idx = pgdat->kcompactd_classzone_idx, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, - cc.classzone_idx); + cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; @@ -2594,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat) /* * Regardless of success, we are done until woken up next. But remember - * the requested order/classzone_idx in case it was higher/tighter than - * our current ones + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones */ if (pgdat->kcompactd_max_order <= cc.order) pgdat->kcompactd_max_order = 0; - if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; } -void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) { if (!order) return; @@ -2611,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kcompactd_max_order < order) pgdat->kcompactd_max_order = order; - if (pgdat->kcompactd_classzone_idx > classzone_idx) - pgdat->kcompactd_classzone_idx = classzone_idx; + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; /* * Pairs with implicit barrier in wait_event_freezable() @@ -2625,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) return; trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, - classzone_idx); + highest_zoneidx); wake_up_interruptible(&pgdat->kcompactd_wait); } @@ -2646,7 +2644,7 @@ static int kcompactd(void *p) set_freezable(); pgdat->kcompactd_max_order = 0; - pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; while (!kthread_should_stop()) { unsigned long pflags; diff --git a/mm/debug.c b/mm/debug.c index 2189357f0987..4f376514744d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) else if (PageAnon(page)) type = "anon "; else if (mapping) { - if (mapping->host && mapping->host->i_dentry.first) { - struct dentry *dentry; - dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); - } else - pr_warn("%ps\n", mapping->a_ops); + const struct inode *host; + const struct address_space_operations *a_ops; + const struct hlist_node *dentry_first; + const struct dentry *dentry_ptr; + struct dentry dentry; + + /* + * mapping can be invalid pointer and we don't want to crash + * accessing it, so probe everything depending on it carefully + */ + if (copy_from_kernel_nofault(&host, &mapping->host, + sizeof(struct inode *)) || + copy_from_kernel_nofault(&a_ops, &mapping->a_ops, + sizeof(struct address_space_operations *))) { + pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); + goto out_mapping; + } + + if (!host) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + if (copy_from_kernel_nofault(&dentry_first, + &host->i_dentry.first, sizeof(struct hlist_node *))) { + pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", + a_ops, host); + goto out_mapping; + } + + if (!dentry_first) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (copy_from_kernel_nofault(&dentry, dentry_ptr, + sizeof(struct dentry))) { + pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", + a_ops, dentry_ptr); + } else { + /* + * if dentry is corrupted, the %pd handler may still + * crash, but it's unlikely that we reach here with a + * corrupted struct page + */ + pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", + a_ops, &dentry); + } } +out_mapping: BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c new file mode 100644 index 000000000000..e45623016aea --- /dev/null +++ b/mm/debug_vm_pgtable.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This kernel test validates architecture page table helpers and + * accessors and helps in verifying their continued compliance with + * expected generic MM semantics. + * + * Copyright (C) 2019 ARM Ltd. + * + * Author: Anshuman Khandual <anshuman.khandual@arm.com> + */ +#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__ + +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/kernel.h> +#include <linux/kconfig.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/module.h> +#include <linux/pfn_t.h> +#include <linux/printk.h> +#include <linux/random.h> +#include <linux/spinlock.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/start_kernel.h> +#include <linux/sched/mm.h> +#include <asm/pgalloc.h> + +#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) + +/* + * On s390 platform, the lower 4 bits are used to identify given page table + * entry type. But these bits might affect the ability to clear entries with + * pxx_clear() because of how dynamic page table folding works on s390. So + * while loading up the entries do not change the lower 4 bits. It does not + * have affect any other platform. + */ +#define S390_MASK_BITS 4 +#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS) +#define RANDOM_NZVALUE GENMASK(7, 0) + +static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pte_t pte = pfn_pte(pfn, prot); + + WARN_ON(!pte_same(pte, pte)); + WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); + WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); + WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pmd_t pmd = pfn_pmd(pfn, prot); + + if (!has_transparent_hugepage()) + return; + + WARN_ON(!pmd_same(pmd, pmd)); + WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); + WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); + WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pmd_bad(). + */ + WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pud_t pud = pfn_pud(pfn, prot); + + if (!has_transparent_hugepage()) + return; + + WARN_ON(!pud_same(pud, pud)); + WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); + WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); + WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + + if (mm_pmd_folded(mm)) + return; + + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pud_bad(). + */ + WARN_ON(!pud_bad(pud_mkhuge(pud))); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +{ + p4d_t p4d; + + memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t)); + WARN_ON(!p4d_same(p4d, p4d)); +} + +static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +{ + pgd_t pgd; + + memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t)); + WARN_ON(!pgd_same(pgd, pgd)); +} + +#ifndef __PAGETABLE_PUD_FOLDED +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +{ + pud_t pud = READ_ONCE(*pudp); + + if (mm_pmd_folded(mm)) + return; + + pud = __pud(pud_val(pud) | RANDOM_ORVALUE); + WRITE_ONCE(*pudp, pud); + pud_clear(pudp); + pud = READ_ONCE(*pudp); + WARN_ON(!pud_none(pud)); +} + +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ + pud_t pud; + + if (mm_pmd_folded(mm)) + return; + /* + * This entry points to next level page table page. + * Hence this must not qualify as pud_bad(). + */ + pmd_clear(pmdp); + pud_clear(pudp); + pud_populate(mm, pudp, pmdp); + pud = READ_ONCE(*pudp); + WARN_ON(pud_bad(pud)); +} +#else /* !__PAGETABLE_PUD_FOLDED */ +static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } +static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, + pmd_t *pmdp) +{ +} +#endif /* PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_P4D_FOLDED +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +{ + p4d_t p4d = READ_ONCE(*p4dp); + + if (mm_pud_folded(mm)) + return; + + p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); + WRITE_ONCE(*p4dp, p4d); + p4d_clear(p4dp); + p4d = READ_ONCE(*p4dp); + WARN_ON(!p4d_none(p4d)); +} + +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ + p4d_t p4d; + + if (mm_pud_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as p4d_bad(). + */ + pud_clear(pudp); + p4d_clear(p4dp); + p4d_populate(mm, p4dp, pudp); + p4d = READ_ONCE(*p4dp); + WARN_ON(p4d_bad(p4d)); +} + +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = READ_ONCE(*pgdp); + + if (mm_p4d_folded(mm)) + return; + + pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); + WRITE_ONCE(*pgdp, pgd); + pgd_clear(pgdp); + pgd = READ_ONCE(*pgdp); + WARN_ON(!pgd_none(pgd)); +} + +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ + pgd_t pgd; + + if (mm_p4d_folded(mm)) + return; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pgd_bad(). + */ + p4d_clear(p4dp); + pgd_clear(pgdp); + pgd_populate(mm, pgdp, p4dp); + pgd = READ_ONCE(*pgdp); + WARN_ON(pgd_bad(pgd)); +} +#else /* !__PAGETABLE_P4D_FOLDED */ +static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } +static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } +static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, + pud_t *pudp) +{ +} +static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, + p4d_t *p4dp) +{ +} +#endif /* PAGETABLE_P4D_FOLDED */ + +static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, + unsigned long vaddr) +{ + pte_t pte = READ_ONCE(*ptep); + + pte = __pte(pte_val(pte) | RANDOM_ORVALUE); + set_pte_at(mm, vaddr, ptep, pte); + barrier(); + pte_clear(mm, vaddr, ptep); + pte = READ_ONCE(*ptep); + WARN_ON(!pte_none(pte)); +} + +static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +{ + pmd_t pmd = READ_ONCE(*pmdp); + + pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); + WRITE_ONCE(*pmdp, pmd); + pmd_clear(pmdp); + pmd = READ_ONCE(*pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pmd_t pmd; + + /* + * This entry points to next level page table page. + * Hence this must not qualify as pmd_bad(). + */ + pmd_clear(pmdp); + pmd_populate(mm, pmdp, pgtable); + pmd = READ_ONCE(*pmdp); + WARN_ON(pmd_bad(pmd)); +} + +static unsigned long __init get_random_vaddr(void) +{ + unsigned long random_vaddr, random_pages, total_user_pages; + + total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE; + + random_pages = get_random_long() % total_user_pages; + random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE; + + return random_vaddr; +} + +static int __init debug_vm_pgtable(void) +{ + struct mm_struct *mm; + pgd_t *pgdp; + p4d_t *p4dp, *saved_p4dp; + pud_t *pudp, *saved_pudp; + pmd_t *pmdp, *saved_pmdp, pmd; + pte_t *ptep; + pgtable_t saved_ptep; + pgprot_t prot; + phys_addr_t paddr; + unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long pud_aligned, p4d_aligned, pgd_aligned; + spinlock_t *uninitialized_var(ptl); + + pr_info("Validating architecture page table helpers\n"); + prot = vm_get_page_prot(VMFLAGS); + vaddr = get_random_vaddr(); + mm = mm_alloc(); + if (!mm) { + pr_err("mm_struct allocation failed\n"); + return 1; + } + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels. + */ + paddr = __pa_symbol(&start_kernel); + + pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; + pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; + pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; + p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; + pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; + WARN_ON(!pfn_valid(pte_aligned)); + + pgdp = pgd_offset(mm, vaddr); + p4dp = p4d_alloc(mm, pgdp, vaddr); + pudp = pud_alloc(mm, p4dp, vaddr); + pmdp = pmd_alloc(mm, pudp, vaddr); + ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl); + + /* + * Save all the page table page addresses as the page table + * entries will be used for testing with random or garbage + * values. These saved addresses will be used for freeing + * page table pages. + */ + pmd = READ_ONCE(*pmdp); + saved_p4dp = p4d_offset(pgdp, 0UL); + saved_pudp = pud_offset(p4dp, 0UL); + saved_pmdp = pmd_offset(pudp, 0UL); + saved_ptep = pmd_pgtable(pmd); + + pte_basic_tests(pte_aligned, prot); + pmd_basic_tests(pmd_aligned, prot); + pud_basic_tests(pud_aligned, prot); + p4d_basic_tests(p4d_aligned, prot); + pgd_basic_tests(pgd_aligned, prot); + + pte_clear_tests(mm, ptep, vaddr); + pmd_clear_tests(mm, pmdp); + pud_clear_tests(mm, pudp); + p4d_clear_tests(mm, p4dp); + pgd_clear_tests(mm, pgdp); + + pte_unmap_unlock(ptep, ptl); + + pmd_populate_tests(mm, pmdp, saved_ptep); + pud_populate_tests(mm, pudp, saved_pmdp); + p4d_populate_tests(mm, p4dp, saved_pudp); + pgd_populate_tests(mm, pgdp, saved_p4dp); + + p4d_free(mm, saved_p4dp); + pud_free(mm, saved_pudp); + pmd_free(mm, saved_pmdp); + pte_free(mm, saved_ptep); + + mm_dec_nr_puds(mm); + mm_dec_nr_pmds(mm); + mm_dec_nr_ptes(mm); + mmdrop(mm); + return 0; +} +late_initcall(debug_vm_pgtable); diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..0e66f2aaeea3 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include <asm/unistd.h> +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. @@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) if (!nrpages) nrpages = ~0UL; - /* - * Ignore return value because fadvise() shall return - * success even if filesystem can't retrieve a hint, - */ force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: diff --git a/mm/filemap.c b/mm/filemap.c index 23a051a7ef0f..f0ae9a6308cb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -76,16 +76,16 @@ * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) * - * ->mmap_sem + * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * - * ->mmap_sem + * ->mmap_lock * ->lock_page (access_process_vm) * * ->i_mutex (generic_perform_write) - * ->mmap_sem (fault_in_pages_readable->do_page_fault) + * ->mmap_lock (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) @@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, nr = hpage_nr_pages(page); - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { @@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; + mem_cgroup_migrate(old, new); + xas_lock_irqsave(&xas, flags); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!PageHuge(old)) - __dec_node_page_state(new, NR_FILE_PAGES); + __dec_lruvec_page_state(old, NR_FILE_PAGES); if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); + __inc_lruvec_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(old)) - __dec_node_page_state(new, NR_SHMEM); + __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); + __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irqrestore(&xas, flags); - mem_cgroup_migrate(old, new); if (freepage) freepage(old); put_page(old); @@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page, { XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); - struct mem_cgroup *memcg; int error; void *old; @@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(PageSwapBacked(page), page); mapping_set_update(&xas, mapping); - if (!huge) { - error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg, false); - if (error) - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; + if (!huge) { + error = mem_cgroup_charge(page, current->mm, gfp_mask); + if (error) + goto error; + } + do { xas_lock_irq(&xas); old = xas_load(&xas); @@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting */ if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); + __inc_lruvec_page_state(page, NR_FILE_PAGES); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); - if (xas_error(&xas)) + if (xas_error(&xas)) { + error = xas_error(&xas); goto error; + } - if (!huge) - mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return xas_error(&xas); + return error; } ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); @@ -1259,7 +1256,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * instead. * * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unneccssary since it is + * being cleared, but a memory barrier should be unnecessary since it is * in the same byte as PG_locked. */ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) @@ -1374,27 +1371,27 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); /* * Return values: - * 1 - page is locked; mmap_sem is still held. + * 1 - page is locked; mmap_lock is still held. * 0 - page is not locked. - * mmap_sem has been released (up_read()), unless flags had both + * mmap_lock has been released (mmap_read_unlock(), unless flags had both * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in - * which case mmap_sem is still held. + * which case mmap_lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 - * with the page locked and the mmap_sem unperturbed. + * with the page locked and the mmap_lock unperturbed. */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { if (fault_flag_allow_retry_first(flags)) { /* - * CAUTION! In this case, mmap_sem is not released + * CAUTION! In this case, mmap_lock is not released * even though return 0. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return 0; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (flags & FAULT_FLAG_KILLABLE) wait_on_page_locked_killable(page); else @@ -1406,7 +1403,7 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, ret = __lock_page_killable(page); if (ret) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } } else @@ -1991,7 +1988,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) * * total number of bytes copied, including those the were already @written * * negative error code if nothing was copied */ -static ssize_t generic_file_buffered_read(struct kiocb *iocb, +ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; @@ -2243,6 +2240,7 @@ out: file_accessed(filp); return written ? written : error; } +EXPORT_SYMBOL_GPL(generic_file_buffered_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2315,14 +2313,14 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* - * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem + * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @page - the page to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * - * This works similar to lock_page_or_retry in that it can drop the mmap_sem. + * This works similar to lock_page_or_retry in that it can drop the mmap_lock. * It differs in that it actually returns the page locked if it returns 1 and 0 - * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin + * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin * will point to the pinned file and needs to be fput()'ed at a later point. */ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, @@ -2333,7 +2331,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -2343,13 +2341,13 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__lock_page_killable(page)) { /* - * We didn't have the right flags to drop the mmap_sem, + * We didn't have the right flags to drop the mmap_lock, * but all fault_handlers only check for fatal signals * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_sem here and return 0 if we don't have a fpin. + * mmap_lock here and return 0 if we don't have a fpin. */ if (*fpin == NULL) - up_read(&vmf->vma->vm_mm->mmap_sem); + mmap_read_unlock(vmf->vma->vm_mm); return 0; } } else @@ -2411,7 +2409,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that - * was pinned if we have to drop the mmap_sem in order to do IO. + * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct page *page) @@ -2446,12 +2444,12 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * - * vma->vm_mm->mmap_sem must be held on entry. + * vma->vm_mm->mmap_lock must be held on entry. * - * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem + * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_page_maybe_drop_mmap(). * - * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. @@ -2521,7 +2519,7 @@ retry_find: goto page_not_uptodate; /* - * We've made it this far and we had to drop our mmap_sem, now is the + * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ @@ -2566,13 +2564,12 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - /* Things didn't work out. Return zero to tell the mm layer so. */ shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; out_retry: /* - * We dropped the mmap_sem, we need to return to the fault handler to + * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ @@ -2636,7 +2633,7 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, NULL, page)) + if (alloc_set_pte(vmf, page)) goto unlock; unlock_page(page); goto next; diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad5..10f82d5643b6 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -29,7 +29,7 @@ * different type underlying the specified range of virtual addresses. * When the function isn't able to map a single page, it returns error. * - * This function takes care of grabbing mmap_sem as necessary. + * This function takes care of grabbing mmap_lock as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, unsigned int gup_flags, struct frame_vector *vec) @@ -48,7 +48,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, start = untagged_addr(start); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); locked = 1; vma = find_vma_intersection(mm, start, start + 1); if (!vma) { @@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(start, nr_frames, + ret = pin_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -102,7 +102,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); out: if (locked) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (!ret) ret = -EFAULT; if (ret > 0) @@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames); */ void put_vaddr_frames(struct frame_vector *vec) { - int i; struct page **pages; if (!vec->got_ref) @@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec) */ if (WARN_ON(IS_ERR(pages))) goto out; - for (i = 0; i < vec->nr_frames; i++) - put_page(pages[i]); + + unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; diff --git a/mm/frontswap.c b/mm/frontswap.c index 60bb20e8a951..bfa3a339253e 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { } * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). - * Fortunatly for us, the swapon_mutex has been taked by the callee so we are + * Fortunately for us, the swapon_mutex has been taken by the callee so we are * OK. The other scenario where calls to frontswap_store (called via * swap_writepage) is racing with frontswap_invalidate_area (called via * swapoff) is again guarded by the swap subsystem. @@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } /* - * Used to check if it's necessory and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shink pages, + * Used to check if it's necessary and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shrink pages, * error code when there is an error. */ static int __frontswap_shrink(unsigned long target_pages, @@ -19,7 +19,6 @@ #include <linux/sched/mm.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include "internal.h" @@ -382,13 +381,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, } /* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pte's, + * but only after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte)); +} + +/* + * A (separate) COW fault might break the page the other way and + * get_user_pages() would return the page from what is now the wrong + * VM. So we need to force a COW break at GUP time even for reads. + */ +static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags) +{ + return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN)); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -584,7 +592,7 @@ retry: pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because - * mmap_sem is held in read mode + * mmap_lock is held in read mode */ if (pmd_none(pmdval)) return no_page_table(vma, flags); @@ -847,8 +855,8 @@ unmap: } /* - * mmap_sem must be held on entry. If @locked != NULL and *@flags - * does not include FOLL_NOWAIT, the mmap_sem may be released. If it + * mmap_lock must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_lock may be released. If it * is, *@locked will be set to 0 and -EBUSY returned. */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, @@ -971,7 +979,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. - * @locked: whether we're still with the mmap_sem held + * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: @@ -980,12 +988,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_sem is held. + * @vmas are valid only as long as mmap_lock is held. * - * Must be called with mmap_sem held. It may be released. See below. + * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -1006,12 +1015,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * appropriate) must be called after the page is finished with, and * before put_page is called. * - * If @locked != NULL, *@locked will be set to 0 when mmap_sem is + * If @locked != NULL, *@locked will be set to 0 when mmap_lock is * released by an up_read(). That can happen if @gup_flags does not * have FOLL_NOWAIT. * * A caller using such a combination of @locked and @gup_flags - * must therefore hold the mmap_sem for reading only, and recognize + * must therefore hold the mmap_lock for reading only, and recognize * when it's been released. Otherwise, it must be held for either * reading or writing and will not be released. * @@ -1066,13 +1075,15 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto out; } if (is_vm_hugetlb_page(vma)) { + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags, locked); + foll_flags, locked); if (locked && *locked == 0) { /* * We've got a VM_FAULT_RETRY - * and we've lost mmap_sem. + * and we've lost mmap_lock. * We must stop here. */ BUG_ON(gup_flags & FOLL_NOWAIT); @@ -1082,13 +1093,17 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } } + + if (should_force_cow_break(vma, foll_flags)) + foll_flags |= FOLL_WRITE; + retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { - ret = -ERESTARTSYS; + ret = -EINTR; goto out; } cond_resched(); @@ -1168,15 +1183,16 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() - * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -1195,8 +1211,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This function will not return with an unlocked mmap_sem. So it has not the - * same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_lock. So it has not the + * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, @@ -1218,6 +1234,10 @@ retry: if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; + if ((fault_flags & FAULT_FLAG_KILLABLE) && + fatal_signal_pending(current)) + return -EINTR; + ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { @@ -1229,12 +1249,10 @@ retry: } if (ret & VM_FAULT_RETRY) { - down_read(&mm->mmap_sem); - if (!(fault_flags & FAULT_FLAG_TRIED)) { - *unlocked = true; - fault_flags |= FAULT_FLAG_TRIED; - goto retry; - } + mmap_read_lock(mm); + *unlocked = true; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; } if (tsk) { @@ -1247,6 +1265,10 @@ retry: } EXPORT_SYMBOL_GPL(fixup_user_fault); +/* + * Please note that this function, unlike __get_user_pages will not + * return 0 for nr_pages > 0 without FOLL_NOWAIT + */ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, @@ -1332,7 +1354,7 @@ retry: break; } - ret = down_read_killable(&mm->mmap_sem); + ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) @@ -1367,7 +1389,7 @@ retry: * We must let the caller know we temporarily dropped the lock * and so the critical section protected by it was lost. */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); *locked = 0; } return pages_done; @@ -1378,13 +1400,13 @@ retry: * @vma: target vma * @start: start address * @end: end address - * @locked: whether the mmap_sem is still held + * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held. + * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. @@ -1403,7 +1425,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + mmap_assert_locked(mm); gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) @@ -1436,7 +1458,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { @@ -1455,7 +1477,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) */ if (!locked) { locked = 1; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, nstart); } else if (nstart >= vma->vm_end) vma = vma->vm_next; @@ -1487,7 +1509,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) ret = 0; } if (locked) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } @@ -1503,7 +1525,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save diskspace. * - * Called without mmap_sem, but after all other threads have been killed. + * Called without mmap_lock, but after all other threads have been killed. */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) @@ -1837,7 +1859,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1864,17 +1886,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * The caller is responsible for releasing returned @pages, via put_page(). * - * @vmas are valid only as long as mmap_sem is held. + * @vmas are valid only as long as mmap_lock is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_lock held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page @@ -1886,17 +1908,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1935,7 +1957,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1958,26 +1990,37 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * - * down_read(&mm->mmap_sem); + * mmap_read_lock(mm); * do_something() * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); + * mmap_read_unlock(mm); * * to: * * int locked = 1; - * down_read(&mm->mmap_sem); + * mmap_read_lock(mm); * do_something() * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) - * up_read(&mm->mmap_sem); + * mmap_read_unlock(mm); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -1991,6 +2034,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, @@ -2001,9 +2050,9 @@ EXPORT_SYMBOL(get_user_pages_locked); /* * get_user_pages_unlocked() is suitable to replace the form: * - * down_read(&mm->mmap_sem); + * mmap_read_lock(mm); * get_user_pages(tsk, mm, ..., pages, NULL); - * up_read(&mm->mmap_sem); + * mmap_read_unlock(mm); * * with: * @@ -2029,11 +2078,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); if (locked) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret; } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -2147,7 +2196,7 @@ static inline pte_t gup_get_pte(pte_t *ptep) */ static inline pte_t gup_get_pte(pte_t *ptep) { - return READ_ONCE(*ptep); + return ptep_get(ptep); } #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ @@ -2250,7 +2299,7 @@ pte_unmap: * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2376,7 +2425,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, if (pte_end < end) end = pte_end; - pte = READ_ONCE(*ptep); + pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; @@ -2655,7 +2704,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2664,62 +2713,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -/* - * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - * - * If the architecture does not support this function, simply return with no - * pages pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - unsigned long len, end; - unsigned long flags; - int nr_pinned = 0; - /* - * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, - * because gup fast is always a "pin with a +1 page refcount" request. - */ - unsigned int gup_flags = FOLL_GET; - - if (write) - gup_flags |= FOLL_WRITE; - - start = untagged_addr(start) & PAGE_MASK; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) - return 0; - if (unlikely(!access_ok((void __user *)start, len))) - return 0; - - /* - * Disable interrupts. We use the nested form as we can already have - * interrupts disabled by get_futex_key. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. - */ - - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_save(flags); - gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); - local_irq_restore(flags); - } - - return nr_pinned; -} -EXPORT_SYMBOL_GPL(__get_user_pages_fast); - static int __gup_longterm_unlocked(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { @@ -2730,11 +2723,11 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, * get_user_pages_unlocked() (see comments in that function) */ if (gup_flags & FOLL_LONGTERM) { - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); ret = __gup_longterm_locked(current, current->mm, start, nr_pages, pages, NULL, gup_flags); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } else { ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags); @@ -2748,12 +2741,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, struct page **pages) { unsigned long addr, len, end; + unsigned long flags; int nr_pinned = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE | FOLL_PIN | FOLL_GET))) + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY))) return -EINVAL; + if (!(gup_flags & FOLL_FAST_ONLY)) + might_lock_read(¤t->mm->mmap_lock); + start = untagged_addr(start) & PAGE_MASK; addr = start; len = (unsigned long) nr_pages << PAGE_SHIFT; @@ -2764,15 +2762,42 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && - gup_fast_permitted(start, end)) { - local_irq_disable(); - gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned); - local_irq_enable(); + /* + * The FAST_GUP case requires FOLL_WRITE even for pure reads, + * because get_user_pages() may need to cause an early COW in + * order to avoid confusing the normal COW routines. So only + * targets that are already writable are safe to do by just + * looking at the page tables. + * + * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here, + * because there is no slow path to fall back on. But you'd + * better be careful about possible COW pages - you'll get _a_ + * COW page, but not necessarily the one you intended to get + * depending on what COW event happens after this. COW may break + * the page copy in a random direction. + * + * Disable interrupts. The nested form is used, in order to allow + * full, general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + unsigned long fast_flags = gup_flags; + if (!(gup_flags & FOLL_FAST_ONLY)) + fast_flags |= FOLL_WRITE; + + local_irq_save(flags); + gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); + local_irq_restore(flags); ret = nr_pinned; } - if (nr_pinned < nr_pages) { + if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { /* Try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; @@ -2791,6 +2816,54 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + * + * If the architecture does not support this function, simply return with no + * pages pinned. + * + * Careful, careful! COW breaking can go either way, so a non-write + * access can get ambiguous page results. If you call this function without + * 'write' set, you'd better be sure that you're ok with that ambiguity. + */ +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + /* + * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, + * because gup fast is always a "pin with a +1 page refcount" request. + * + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; + + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + + /* + * As specified in the API description above, this routine is not + * allowed to return negative values. However, the common core + * routine internal_get_user_pages_fast() *can* return -errno. + * Therefore, correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2800,7 +2873,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. + * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * @@ -2843,10 +2916,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for further details. - * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). + * see Documentation/core-api/pin_user_pages.rst for further details. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2860,6 +2930,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, } EXPORT_SYMBOL_GPL(pin_user_pages_fast); +/* + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. + * + * The API rules are the same, too: no negative values may be returned. + */ +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int nr_pinned; + + /* + * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API + * rules require returning 0, rather than -errno: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return 0; + /* + * FOLL_FAST_ONLY is required in order to match the API description of + * this routine: no fall back to regular ("slow") GUP. + */ + gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY); + nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, + pages); + /* + * This routine is not allowed to return negative values. However, + * internal_get_user_pages_fast() *can* return -errno. Therefore, + * correct for that here: + */ + if (nr_pinned < 0) + nr_pinned = 0; + + return nr_pinned; +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); + /** * pin_user_pages_remote() - pin pages of a remote process (task != current) * @@ -2883,10 +2989,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). + * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -2919,10 +3022,7 @@ EXPORT_SYMBOL(pin_user_pages_remote); * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please - * see Documentation/vm/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). + * see Documentation/core-api/pin_user_pages.rst for details. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -2937,3 +3037,49 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. + */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); @@ -37,28 +37,13 @@ enum { HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, }; -/* - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn - * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the device entry - * Return: valid device entry for the pfn - */ -static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; -} - static int hmm_pfns_fill(unsigned long addr, unsigned long end, - struct hmm_range *range, enum hmm_pfn_value_e value) + struct hmm_range *range, unsigned long cpu_flags) { - uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i = (addr - range->start) >> PAGE_SHIFT; - i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[value]; - + range->hmm_pfns[i] = cpu_flags; return 0; } @@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end, } static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags) + unsigned long pfn_req_flags, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; @@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * waste to have the user pre-fill the pfn arrays with a default * flags value. */ - pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + pfn_req_flags &= range->pfn_flags_mask; + pfn_req_flags |= range->default_flags; /* We aren't ask to do anything ... */ - if (!(pfns & range->flags[HMM_PFN_VALID])) + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) return 0; /* Need to write fault ? */ - if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && + !(cpu_flags & HMM_PFN_WRITE)) return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; /* If CPU page table is not valid then we need to fault */ - if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + if (!(cpu_flags & HMM_PFN_VALID)) return HMM_NEED_FAULT; return 0; } static unsigned int hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags) + const unsigned long hmm_pfns[], unsigned long npages, + unsigned long cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault = 0; @@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, * hmm_pte_need_fault() will always return 0. */ if (!((range->default_flags | range->pfn_flags_mask) & - range->flags[HMM_PFN_VALID])) + HMM_PFN_REQ_FAULT)) return 0; for (i = 0; i < npages; ++i) { - required_fault |= - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], + cpu_flags); if (required_fault == HMM_NEED_ALL_BITS) return required_fault; } @@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; unsigned long i, npages; - uint64_t *pfns; + unsigned long *hmm_pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + hmm_pfns = &range->hmm_pfns[i]; + required_fault = + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); if (!walk->vma) { if (required_fault) return -EFAULT; @@ -174,46 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, } if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); - hmm_vma_walk->last = addr; - return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(addr, end, range, 0); } -static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, + pmd_t pmd) { if (pmd_protnone(pmd)) return 0; - return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd) + unsigned long end, unsigned long hmm_pfns[], + pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); required_fault = - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - hmm_vma_walk->last = end; + hmm_pfns[i] = pfn | cpu_flags; return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ /* stub to allow the code below to compile */ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, - unsigned long end, uint64_t *pfns, pmd_t pmd); + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool hmm_is_device_private_entry(struct hmm_range *range, @@ -224,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range, range->dev_private_owner; } -static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, + pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; - return pte_write(pte) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long end, pmd_t *pmdp, pte_t *ptep, - uint64_t *pfn) + unsigned long *hmm_pfn) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned int required_fault; - uint64_t cpu_flags; + unsigned long cpu_flags; pte_t pte = *ptep; - uint64_t orig_pfn = *pfn; + uint64_t pfn_req_flags = *hmm_pfn; if (pte_none(pte)) { - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -260,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * the PFN even if not present. */ if (hmm_is_device_private_entry(range, entry)) { - *pfn = hmm_device_entry_from_pfn(range, - device_private_entry_to_pfn(entry)); - *pfn |= range->flags[HMM_PFN_VALID]; + cpu_flags = HMM_PFN_VALID; if (is_write_device_private_entry(entry)) - *pfn |= range->flags[HMM_PFN_WRITE]; + cpu_flags |= HMM_PFN_WRITE; + *hmm_pfn = device_private_entry_to_pfn(entry) | + cpu_flags; return 0; } - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (!required_fault) { - *pfn = range->values[HMM_PFN_NONE]; + *hmm_pfn = 0; return 0; } @@ -290,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, } cpu_flags = pte_to_hmm_pfn_flags(range, pte); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) goto fault; @@ -299,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * fall through and treat it like a normal page. */ if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { - if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; } - *pfn = range->values[HMM_PFN_SPECIAL]; + *hmm_pfn = HMM_PFN_ERROR; return 0; } - *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; + *hmm_pfn = pte_pfn(pte) | cpu_flags; return 0; fault: @@ -323,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long *hmm_pfns = + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; unsigned long npages = (end - start) >> PAGE_SHIFT; unsigned long addr = start; pte_t *ptep; @@ -335,16 +324,16 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + return hmm_pfns_fill(start, end, range, 0); } if (!pmd_present(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } @@ -364,7 +353,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); } /* @@ -374,37 +363,33 @@ again: * recover. */ if (pmd_bad(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); } ptep = pte_offset_map(pmdp, addr); - for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); if (r) { /* hmm_vma_handle_pte() did pte_unmap() */ - hmm_vma_walk->last = addr; return r; } } pte_unmap(ptep - 1); - - hmm_vma_walk->last = addr; return 0; } #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, + pud_t pud) { if (!pud_present(pud)) return 0; - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; + return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; } static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, @@ -432,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; unsigned int required_fault; - uint64_t *pfns, cpu_flags; + unsigned long *hmm_pfns; + unsigned long cpu_flags; if (!pud_present(pud)) { spin_unlock(ptl); @@ -441,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; + hmm_pfns = &range->hmm_pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); if (required_fault) { spin_unlock(ptl); @@ -453,9 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) - pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; - hmm_vma_walk->last = end; + hmm_pfns[i] = pfn | cpu_flags; goto out_unlock; } @@ -479,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - uint64_t orig_pfn, cpu_flags; unsigned int required_fault; + unsigned long pfn_req_flags; + unsigned long cpu_flags; spinlock_t *ptl; pte_t entry; @@ -488,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; - orig_pfn = range->pfns[i]; + pfn_req_flags = range->hmm_pfns[i]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); if (required_fault) { spin_unlock(ptl); return hmm_vma_fault(addr, end, required_fault, walk); @@ -498,9 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | - cpu_flags; - hmm_vma_walk->last = end; + range->hmm_pfns[i] = pfn | cpu_flags; + spin_unlock(ptl); return 0; } @@ -531,13 +516,12 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, * failure. */ if (hmm_range_need_fault(hmm_vma_walk, - range->pfns + + range->hmm_pfns + ((start - range->start) >> PAGE_SHIFT), (end - start) >> PAGE_SHIFT, 0)) return -EFAULT; hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); - hmm_vma_walk->last = end; /* Skip this vma and continue processing the next vma. */ return 1; @@ -555,9 +539,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * hmm_range_fault - try to fault some address in a virtual address range * @range: argument structure * - * Return: the number of valid pages in range->pfns[] (from range start - * address), which may be zero. On error one of the following status codes - * can be returned: + * Returns 0 on success or one of the following error codes: * * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma * (e.g., device file vma). @@ -572,7 +554,7 @@ static const struct mm_walk_ops hmm_walk_ops = { * This is similar to get_user_pages(), except that it can read the page tables * without mutating them (ie causing faults). */ -long hmm_range_fault(struct hmm_range *range) +int hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, @@ -581,7 +563,7 @@ long hmm_range_fault(struct hmm_range *range) struct mm_struct *mm = range->notifier->mm; int ret; - lockdep_assert_held(&mm->mmap_sem); + mmap_assert_locked(mm); do { /* If range is no longer valid force retry. */ @@ -590,10 +572,13 @@ long hmm_range_fault(struct hmm_range *range) return -EBUSY; ret = walk_page_range(mm, hmm_vma_walk.last, range->end, &hmm_walk_ops, &hmm_vma_walk); + /* + * When -EBUSY is returned the loop restarts with + * hmm_vma_walk.last set to an address that has not been stored + * in pfns. All entries < last in the pfn array are set to their + * output, and all >= are still at their input values. + */ } while (ret == -EBUSY); - - if (ret) - return ret; - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + return ret; } EXPORT_SYMBOL(hmm_range_fault); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ecd1045113b..78c84bee7e29 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page) bool is_transparent_hugepage(struct page *page) { if (!PageCompound(page)) - return 0; + return false; page = compound_head(page); return is_huge_zero_page(page) || @@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_charge(page, vma->vm_mm, gfp)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } + cgroup_throttle_swaprate(page, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { @@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, vm_fault_t ret2; spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); ret2 = handle_userfault(vmf, VM_UFFD_MISSING); @@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); @@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -658,7 +656,6 @@ unlock_release: release: if (pgtable) pte_free(vma->vm_mm, pgtable); - mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return ret; @@ -1255,273 +1252,72 @@ unlock: spin_unlock(vmf->ptl); } -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, - pmd_t orig_pmd, struct page *page) -{ - struct vm_area_struct *vma = vmf->vma; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mem_cgroup *memcg; - pgtable_t pgtable; - pmd_t _pmd; - int i; - vm_fault_t ret = 0; - struct page **pages; - struct mmu_notifier_range range; - - pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *), - GFP_KERNEL); - if (unlikely(!pages)) { - ret |= VM_FAULT_OOM; - goto out; - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, - vmf->address, page_to_nid(page)); - if (unlikely(!pages[i] || - mem_cgroup_try_charge_delay(pages[i], vma->vm_mm, - GFP_KERNEL, &memcg, false))) { - if (pages[i]) - put_page(pages[i]); - while (--i >= 0) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, - false); - put_page(pages[i]); - } - kfree(pages); - ret |= VM_FAULT_OOM; - goto out; - } - set_page_private(pages[i], (unsigned long)memcg); - } - - for (i = 0; i < HPAGE_PMD_NR; i++) { - copy_user_highpage(pages[i], page + i, - haddr + PAGE_SIZE * i, vma); - __SetPageUptodate(pages[i]); - cond_resched(); - } - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_free_pages; - VM_BUG_ON_PAGE(!PageHead(page), page); - - /* - * Leave pmd empty until pte is filled note we must notify here as - * concurrent CPU thread might write to new page before the call to - * mmu_notifier_invalidate_range_end() happens which can lead to a - * device seeing memory write in different order than CPU. - * - * See Documentation/vm/mmu_notifier.rst - */ - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); - pmd_populate(vma->vm_mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t entry; - entry = mk_pte(pages[i], vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); - mem_cgroup_commit_charge(pages[i], memcg, false, false); - lru_cache_add_active_or_unevictable(pages[i], vma); - vmf->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*vmf->pte)); - set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); - pte_unmap(vmf->pte); - } - kfree(pages); - - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, vmf->pmd, pgtable); - page_remove_rmap(page, true); - spin_unlock(vmf->ptl); - - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); - - ret |= VM_FAULT_WRITE; - put_page(page); - -out: - return ret; - -out_free_pages: - spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(&range); - for (i = 0; i < HPAGE_PMD_NR; i++) { - memcg = (void *)page_private(pages[i]); - set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg, false); - put_page(pages[i]); - } - kfree(pages); - goto out; -} - vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *new_page; - struct mem_cgroup *memcg; + struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - struct mmu_notifier_range range; - gfp_t huge_gfp; /* for allocation and charge */ - vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); + if (is_huge_zero_pmd(orig_pmd)) - goto alloc; + goto fallback; + spin_lock(vmf->ptl); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) - goto out_unlock; + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. - */ + + /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); lock_page(page); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); unlock_page(page); put_page(page); - goto out_unlock; + return 0; } put_page(page); } + + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. + */ if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - ret |= VM_FAULT_WRITE; unlock_page(page); - goto out_unlock; - } - unlock_page(page); - get_page(page); - spin_unlock(vmf->ptl); -alloc: - if (__transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_direct_gfpmask(vma); - new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); - } else - new_page = NULL; - - if (likely(new_page)) { - prep_transhuge_page(new_page); - } else { - if (!page) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } else { - ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); - if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, vmf->pmd, vmf->address); - ret |= VM_FAULT_FALLBACK; - } - put_page(page); - } - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - - if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { - put_page(new_page); - split_huge_pmd(vma, vmf->pmd, vmf->address); - if (page) - put_page(page); - ret |= VM_FAULT_FALLBACK; - count_vm_event(THP_FAULT_FALLBACK); - count_vm_event(THP_FAULT_FALLBACK_CHARGE); - goto out; - } - - count_vm_event(THP_FAULT_ALLOC); - count_memcg_events(memcg, THP_FAULT_ALLOC, 1); - - if (!page) - clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); - else - copy_user_huge_page(new_page, page, vmf->address, - vma, HPAGE_PMD_NR); - __SetPageUptodate(new_page); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - haddr, haddr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - - spin_lock(vmf->ptl); - if (page) - put_page(page); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(new_page, memcg, true); - put_page(new_page); - goto out_mn; - } else { - pmd_t entry; - entry = mk_huge_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - page_add_new_anon_rmap(new_page, vma, haddr, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - if (!page) { - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - } else { - VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page, true); - put_page(page); - } - ret |= VM_FAULT_WRITE; + return VM_FAULT_WRITE; } + + unlock_page(page); spin_unlock(vmf->ptl); -out_mn: - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pmdp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); -out: - return ret; -out_unlock: - spin_unlock(vmf->ptl); - return ret; +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; } /* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. + * FOLL_FORCE or a forced COW break can write even to unwritable pmd's, + * but only after we've gone through a COW cycle and they are dirty. */ static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd)); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1582,7 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto skip_mlock; if (!trylock_page(page)) goto skip_mlock; - lru_add_drain(); if (page->mapping && !PageDoubleMap(page)) mlock_vma_page(page); unlock_page(page); @@ -1852,8 +1647,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); + orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, + tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) @@ -1951,7 +1746,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * We don't have to worry about the ordering of src and dst - * ptlocks because exclusive mmap_sem prevents deadlock. + * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) { @@ -2038,9 +1833,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto unlock; /* - * In case prot_numa, we are under down_read(mmap_sem). It's critical + * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED - * which is also under down_read(mmap_sem): + * which is also under mmap_read_lock(mm): * * CPU0: CPU1: * change_huge_pmd(prot_numa=1) @@ -2360,15 +2155,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, atomic_inc(&page[i]._mapcount); } + lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) atomic_dec(&page[i]._mapcount); } } + unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2386,6 +2183,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; + bool was_locked = false; + pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2398,11 +2197,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmd against. Otherwise we can end up replacing wrong page. */ VM_BUG_ON(freeze && !page); - if (page && page != pmd_page(*pmd)) - goto out; + if (page) { + VM_WARN_ON_ONCE(!PageLocked(page)); + was_locked = true; + if (page != pmd_page(*pmd)) + goto out; + } +repeat: if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + if (!page) { + page = pmd_page(*pmd); + if (unlikely(!trylock_page(page))) { + get_page(page); + _pmd = *pmd; + spin_unlock(ptl); + lock_page(page); + spin_lock(ptl); + if (unlikely(!pmd_same(*pmd, _pmd))) { + unlock_page(page); + put_page(page); + page = NULL; + goto repeat; + } + put_page(page); + } + } if (PageMlocked(page)) clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) @@ -2410,6 +2230,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); + if (!was_locked && page) + unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): @@ -2784,7 +2606,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - bool mlocked; unsigned long flags; pgoff_t end; @@ -2797,7 +2618,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (PageAnon(head)) { /* - * The caller does not necessarily hold an mmap_sem that would + * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This * is similar to page_lock_anon_vma_read except the write lock @@ -2843,14 +2664,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - mlocked = PageMlocked(head); unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); - /* Make sure the page is not on per-CPU pagevec as it takes pin */ - if (mlocked) - lru_add_drain(); - /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irqsave(&pgdata->lru_lock, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..57ece74e3aae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,7 +31,6 @@ #include <linux/cma.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/tlb.h> #include <linux/io.h> @@ -59,8 +58,8 @@ __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; -static unsigned long __initdata default_hstate_size; static bool __initdata parsed_valid_hugepagesz = true; +static bool __initdata parsed_default_hugepagesz; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -85,7 +84,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, give up any reservations mased on minimum size and + * remain, give up any reservations based on minimum size and * free the subpool */ if (free) { if (spool->min_hpages != -1) @@ -133,7 +132,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * the request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where - * a subpool minimum size must be manitained. + * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) @@ -473,7 +472,7 @@ out_of_memory: * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for - * this operation and we were not able to allocate, it ruturns -ENOMEM. + * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. @@ -988,7 +987,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * We know VM_NORESERVE is not set. Therefore, there SHOULD * be a region map for all pages. The only situation where * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reverves to + * fallocate. In this case, there really are no reserves to * use. This situation is indicated if chg != 0. */ if (chg) @@ -1519,7 +1518,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwse drivers using get_user_pages() to access tail + * too. Otherwise drivers using get_user_pages() to access tail * pages may get the reference counting wrong if they see * PG_reserved set on a tail page (despite the head page not * having PG_reserved set). Enforcing this consistency between @@ -3060,7 +3059,7 @@ static void __init hugetlb_sysfs_init(void) err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) - pr_err("Hugetlb: Unable to add hstate %s", h->name); + pr_err("HugeTLB: Unable to add hstate %s", h->name); } } @@ -3164,7 +3163,7 @@ static void hugetlb_register_node(struct node *node) nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { - pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; @@ -3212,23 +3211,41 @@ static int __init hugetlb_init(void) { int i; - if (!hugepages_supported()) + if (!hugepages_supported()) { + if (hugetlb_max_hstate || default_hstate_max_huge_pages) + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; + } - if (!size_to_hstate(default_hstate_size)) { - if (default_hstate_size != 0) { - pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", - default_hstate_size, HPAGE_SIZE); + /* + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some + * architectures depend on setup being done here. + */ + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + if (!parsed_default_hugepagesz) { + /* + * If we did not parse a default huge page size, set + * default_hstate_idx to HPAGE_SIZE hstate. And, if the + * number of huge pages for this default size was implicitly + * specified, set that here as well. + * Note that the implicit setting will overwrite an explicit + * setting. A warning will be printed in this case. + */ + default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); + if (default_hstate_max_huge_pages) { + if (default_hstate.max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(&default_hstate), + 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", + default_hstate.max_huge_pages, buf); + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", + default_hstate_max_huge_pages); + } + default_hstate.max_huge_pages = + default_hstate_max_huge_pages; } - - default_hstate_size = HPAGE_SIZE; - if (!size_to_hstate(default_hstate_size)) - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - } - default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); - if (default_hstate_max_huge_pages) { - if (!default_hstate.max_huge_pages) - default_hstate.max_huge_pages = default_hstate_max_huge_pages; } hugetlb_cma_check(); @@ -3256,10 +3273,10 @@ static int __init hugetlb_init(void) } subsys_initcall(hugetlb_init); -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_bad_size(void) +/* Overwritten by architectures with more huge page sizes */ +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { - parsed_valid_hugepagesz = false; + return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) @@ -3268,7 +3285,6 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -3289,20 +3305,29 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; } -static int __init hugetlb_nrpages_setup(char *s) +/* + * hugepages command line processing + * hugepages normally follows a valid hugepagsz or default_hugepagsz + * specification. If not, ignore the hugepages value. hugepages can also + * be the first huge page command line option in which case it implicitly + * specifies the number of huge pages for the default size. + */ +static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; if (!parsed_valid_hugepagesz) { - pr_warn("hugepages = %s preceded by " - "an unsupported hugepagesz, ignoring\n", s); + pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; - return 1; + return 0; } + /* - * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter + * yet, so this hugepages= parameter goes to the "default hstate". + * Otherwise, it goes with the previously parsed hugepagesz or + * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; @@ -3310,8 +3335,8 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); - return 1; + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); + return 0; } if (sscanf(s, "%lu", mhp) <= 0) @@ -3329,14 +3354,102 @@ static int __init hugetlb_nrpages_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_nrpages_setup); +__setup("hugepages=", hugepages_setup); + +/* + * hugepagesz command line processing + * A specific huge page size can only be specified once with hugepagesz. + * hugepagesz is followed by hugepages on the command line. The global + * variable 'parsed_valid_hugepagesz' is used to determine if prior + * hugepagesz argument was valid. + */ +static int __init hugepagesz_setup(char *s) +{ + unsigned long size; + struct hstate *h; + + parsed_valid_hugepagesz = false; + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); + return 0; + } + + h = size_to_hstate(size); + if (h) { + /* + * hstate for this size already exists. This is normally + * an error, but is allowed if the existing hstate is the + * default hstate. More specifically, it is only allowed if + * the number of huge pages for the default hstate was not + * previously specified. + */ + if (!parsed_default_hugepagesz || h != &default_hstate || + default_hstate.max_huge_pages) { + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); + return 0; + } + + /* + * No need to call hugetlb_add_hstate() as hstate already + * exists. But, do set parsed_hstate so that a following + * hugepages= parameter will be applied to this hstate. + */ + parsed_hstate = h; + parsed_valid_hugepagesz = true; + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + return 1; +} +__setup("hugepagesz=", hugepagesz_setup); -static int __init hugetlb_default_setup(char *s) +/* + * default_hugepagesz command line input + * Only one instance of default_hugepagesz allowed on command line. + */ +static int __init default_hugepagesz_setup(char *s) { - default_hstate_size = memparse(s, &s); + unsigned long size; + + parsed_valid_hugepagesz = false; + if (parsed_default_hugepagesz) { + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); + return 0; + } + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); + return 0; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + parsed_default_hugepagesz = true; + default_hstate_idx = hstate_index(size_to_hstate(size)); + + /* + * The number of default huge pages (for this size) could have been + * specified as the first hugetlb parameter: hugepages=X. If so, + * then default_hstate_max_huge_pages is set. If the default huge + * page size is gigantic (>= MAX_ORDER), then the pages must be + * allocated here from bootmem allocator. + */ + if (default_hstate_max_huge_pages) { + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (hstate_is_gigantic(&default_hstate)) + hugetlb_hstate_alloc_pages(&default_hstate); + default_hstate_max_huge_pages = 0; + } + return 1; } -__setup("default_hugepagesz=", hugetlb_default_setup); +__setup("default_hugepagesz=", default_hugepagesz_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { @@ -3352,7 +3465,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3488,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3497,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3505,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -4466,9 +4578,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * entry could be a migration/hwpoison entry at this point, so this * check prevents the kernel from going below assuming that we have - * a active hugepage in pagecache. This goto expects the 2nd page fault, - * and is_hugetlb_entry_(migration|hwpoisoned) check will properly - * handle it. + * an active hugepage in pagecache. This goto expects the 2nd page + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will + * properly handle it. */ if (!pte_present(entry)) goto out_mutex; @@ -4583,7 +4695,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, (const void __user *) src_addr, pages_per_huge_page(h), false); - /* fallback to copy_from_user outside mmap_sem */ + /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; *pagep = page; @@ -5355,8 +5467,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * - * Return: Pointer to page table or swap entry (PUD or PMD) for - * address @addr, or NULL if a p*d_none() entry is encountered and the + * Return: Pointer to page table entry (PUD or PMD) for + * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ @@ -5376,20 +5488,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; pud = pud_offset(p4d, addr); - if (sz != PUD_SIZE && pud_none(*pud)) - return NULL; - /* hugepage or swap? */ - if (pud_huge(*pud) || !pud_present(*pud)) + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ return (pte_t *)pud; - - pmd = pmd_offset(pud, addr); - if (sz != PMD_SIZE && pmd_none(*pmd)) + if (!pud_present(*pud)) return NULL; - /* hugepage or swap? */ - if (pmd_huge(*pmd) || !pmd_present(*pmd)) - return (pte_t *)pmd; + /* must have a valid entry and size to go further */ - return NULL; + pmd = pmd_offset(pud, addr); + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ diff --git a/mm/init-mm.c b/mm/init-mm.c index 19603302a77f..3a613c85f9ed 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -6,10 +6,10 @@ #include <linux/list.h> #include <linux/cpumask.h> #include <linux/mman.h> +#include <linux/pgtable.h> #include <linux/atomic.h> #include <linux/user_namespace.h> -#include <asm/pgtable.h> #include <asm/mmu.h> #ifndef INIT_MM_CONTEXT @@ -31,7 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..9886db20d94f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -extern unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); +void __do_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. */ -static inline unsigned long ra_submit(struct file_ra_state *ra, +static inline void ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { - return __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); } /** @@ -125,12 +127,12 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * between functions involved in allocations, including the alloc_pages* * family of functions. * - * nodemask, migratetype and high_zoneidx are initialized only once in + * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages_nodemask() and then never change. * - * zonelist, preferred_zone and classzone_idx are set first in + * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { @@ -138,12 +140,21 @@ struct alloc_context { nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; - enum zone_type high_zoneidx; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. + */ + enum zone_type highest_zoneidx; bool spread_dirty_pages; }; -#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) - /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). @@ -222,7 +233,7 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ @@ -333,7 +344,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * must be called with vma's mmap_sem held for read or write, and page locked. + * must be called with vma's mmap_lock held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); extern unsigned int munlock_vma_page(struct page *page); @@ -402,13 +413,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or - * anything, so we only pin the file and drop the mmap_sem if only + * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); + mmap_read_unlock(vmf->vma->vm_mm); } return fpin; } @@ -527,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 08b43de2383b..d532c2587731 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,23 +1,33 @@ # SPDX-License-Identifier: GPL-2.0 KASAN_SANITIZE := n -UBSAN_SANITIZE_common.o := n -UBSAN_SANITIZE_generic.o := n -UBSAN_SANITIZE_generic_report.o := n -UBSAN_SANITIZE_tags.o := n +UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Disable ftrace to avoid recursion. CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack) +CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector) +# Disable branch tracing to avoid recursion. +CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING -CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o init.o report.o obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2906358e42f0..757d4074fe28 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -33,7 +33,6 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> -#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm) } #endif -extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -extern bool report_enabled(void); - -bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 56ff8885fe2e..098a7dbaced6 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -15,7 +15,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING #include <linux/export.h> #include <linux/interrupt.h> diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcd..fe6be0be1f76 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. - * - * The ifndef is required to avoid build breakage. - * - * With 5level-fixup.h, pgd_populate() is not nop and - * we reference kasan_early_shadow_p4d. It's not defined - * unless 5-level paging enabled. - * - * The ifndef can be dropped once all KASAN-enabled - * architectures will switch to pgtable-nop4d.h. */ -#ifndef __ARCH_HAS_5LEVEL_HACK pgd_populate(&init_mm, pgd, lm_alias(kasan_early_shadow_p4d)); -#endif p4d = p4d_offset(pgd, addr); p4d_populate(&init_mm, p4d, lm_alias(kasan_early_shadow_pud)); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index e8f37199d885..cfade6413528 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -212,8 +212,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); void __asan_register_globals(struct kasan_global *globals, size_t size); void __asan_unregister_globals(struct kasan_global *globals, size_t size); -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); void __asan_handle_no_return(void); void __asan_alloca_poison(unsigned long addr, size_t size); void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); @@ -228,6 +226,8 @@ void __asan_load8(unsigned long addr); void __asan_store8(unsigned long addr); void __asan_load16(unsigned long addr); void __asan_store16(unsigned long addr); +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); void __asan_load1_noabort(unsigned long addr); void __asan_store1_noabort(unsigned long addr); @@ -239,6 +239,21 @@ void __asan_load8_noabort(unsigned long addr); void __asan_store8_noabort(unsigned long addr); void __asan_load16_noabort(unsigned long addr); void __asan_store16_noabort(unsigned long addr); +void __asan_loadN_noabort(unsigned long addr, size_t size); +void __asan_storeN_noabort(unsigned long addr, size_t size); + +void __asan_report_load1_noabort(unsigned long addr); +void __asan_report_store1_noabort(unsigned long addr); +void __asan_report_load2_noabort(unsigned long addr); +void __asan_report_store2_noabort(unsigned long addr); +void __asan_report_load4_noabort(unsigned long addr); +void __asan_report_store4_noabort(unsigned long addr); +void __asan_report_load8_noabort(unsigned long addr); +void __asan_report_store8_noabort(unsigned long addr); +void __asan_report_load16_noabort(unsigned long addr); +void __asan_report_store16_noabort(unsigned long addr); +void __asan_report_load_n_noabort(unsigned long addr, size_t size); +void __asan_report_store_n_noabort(unsigned long addr, size_t size); void __asan_set_shadow_00(const void *addr, size_t size); void __asan_set_shadow_f1(const void *addr, size_t size); @@ -247,4 +262,19 @@ void __asan_set_shadow_f3(const void *addr, size_t size); void __asan_set_shadow_f5(const void *addr, size_t size); void __asan_set_shadow_f8(const void *addr, size_t size); +void __hwasan_load1_noabort(unsigned long addr); +void __hwasan_store1_noabort(unsigned long addr); +void __hwasan_load2_noabort(unsigned long addr); +void __hwasan_store2_noabort(unsigned long addr); +void __hwasan_load4_noabort(unsigned long addr); +void __hwasan_store4_noabort(unsigned long addr); +void __hwasan_load8_noabort(unsigned long addr); +void __hwasan_store8_noabort(unsigned long addr); +void __hwasan_load16_noabort(unsigned long addr); +void __hwasan_store16_noabort(unsigned long addr); +void __hwasan_loadN_noabort(unsigned long addr, size_t size); +void __hwasan_storeN_noabort(unsigned long addr, size_t size); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); + #endif diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 80f23c9da6b0..51ec45407a0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -29,6 +29,7 @@ #include <linux/kasan.h> #include <linux/module.h> #include <linux/sched/task_stack.h> +#include <linux/uaccess.h> #include <asm/sections.h> @@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr) } } -bool report_enabled(void) +static bool report_enabled(void) { if (current->kasan_depth) return false; @@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +static void __kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { struct kasan_access_info info; void *tagged_addr; @@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + unsigned long flags = user_access_save(); + bool ret = false; + + if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + + user_access_restore(flags); + + return ret; +} + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 25b7734e7013..8a959fdd30e3 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -12,7 +12,6 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define DISABLE_BRANCH_PROFILING #include <linux/export.h> #include <linux/interrupt.h> diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 99d77ffb79c2..b043c40a21d4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,8 @@ enum scan_result { SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, @@ -47,7 +49,6 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, }; @@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, khugepaged_max_ptes_swap_store); +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, + khugepaged_max_ptes_shared_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -359,6 +389,7 @@ int __init khugepaged_init(void) khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -503,36 +534,61 @@ void __khugepaged_exit(struct mm_struct *mm) * under mmap sem read mode). Stop here (after we * return all pagetables will be destroyed) until * khugepaged has finished working on the pagetables - * under the mmap_sem. + * under the mmap_lock. */ - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } } static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -compound_nr(page)); unlock_page(page); putback_lru_page(page); } -static void release_pte_pages(pte_t *pte, pte_t *_pte) +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) { + struct page *page, *tmp; + while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); + + page = pte_page(pteval); + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && + !PageCompound(page)) + release_pte_page(page); + } + + list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { + list_del(&page->lru); + release_pte_page(page); } } +static bool is_refcount_suitable(struct page *page) +{ + int expected_refcount; + + expected_refcount = total_mapcount(page); + if (PageSwapCache(page)) + expected_refcount += compound_nr(page); + + return page_count(page) == expected_refcount; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) + pte_t *pte, + struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; @@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + VM_BUG_ON_PAGE(!PageAnon(page), page); + + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out; } - VM_BUG_ON_PAGE(!PageAnon(page), page); + if (PageCompound(page)) { + struct page *p; + page = compound_head(page); + + /* + * Check if we have dealt with the compound page + * already + */ + list_for_each_entry(p, compound_pagelist, lru) { + if (page == p) + goto next; + } + } /* * We can do it before isolate_lru_page because the @@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * The page table that maps the page has been already unlinked + * from the page table tree and this process cannot get + * an additinal pin on the page. + * + * New pins can come later if the page is shared across fork, + * but not from this process. The other process cannot write to + * the page, only trigger CoW. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } + if (!pte_write(pteval) && PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { /* - * Page is not in the swap cache. It can be collapsed - * into a THP. + * Page is in the swap cache and cannot be re-used. + * It cannot be collapsed into a THP. */ + unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; + goto out; } /* @@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_lru(page)); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + compound_nr(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); + if (PageCompound(page)) + list_add_tail(&page->lru, compound_pagelist); +next: /* There should be enough young pte to collapse the page */ if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced++; + + if (pte_write(pteval)) + writable = true; } if (likely(writable)) { if (likely(referenced)) { @@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } out: - release_pte_pages(pte, _pte); + release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; @@ -643,13 +722,14 @@ out: static void __collapse_huge_page_copy(pte_t *pte, struct page *page, struct vm_area_struct *vma, unsigned long address, - spinlock_t *ptl) + spinlock_t *ptl, + struct list_head *compound_pagelist) { + struct page *src_page, *tmp; pte_t *_pte; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; - struct page *src_page; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); @@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats @@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, free_page_and_swap_cache(src_page); } } + + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); + } } static void khugepaged_alloc_sleep(void) @@ -848,8 +933,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) #endif /* - * If mmap_sem temporarily dropped, revalidate vma - * before taking mmap_sem. + * If mmap_lock temporarily dropped, revalidate vma + * before taking mmap_lock. * Return 0 if succeeds, otherwise return none-zero * value (scan code). */ @@ -881,7 +966,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * Only done if khugepaged_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held, - * but with mmap_sem held to protect against vma changes. + * but with mmap_lock held to protect against vma changes. */ static bool __collapse_huge_page_swapin(struct mm_struct *mm, @@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pgoff = linear_page_index(vma, address), }; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { @@ -913,9 +993,9 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, swapped_in++; ret = do_swap_page(&vmf); - /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ + /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { - down_read(&mm->mmap_sem); + mmap_read_lock(mm); if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); @@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, } vmf.pte--; pte_unmap(vmf.pte); + + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - int node, int referenced) + int node, int referenced, int unmapped) { + LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; - struct mem_cgroup *memcg; struct vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; @@ -962,57 +1047,56 @@ static void collapse_huge_page(struct mm_struct *mm, gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* - * Before allocating the hugepage, release the mmap_sem read lock. + * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves - * sync compaction, and we do not need to hold the mmap_sem during + * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. */ - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); goto out_nolock; } pmd = mm_find_pmd(mm, address); if (!pmd) { result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); goto out_nolock; } /* - * __collapse_huge_page_swapin always returns with mmap_sem locked. - * If it fails, we release mmap_sem and jump out_nolock. + * __collapse_huge_page_swapin always returns with mmap_lock locked. + * If it fails, we release mmap_lock and jump out_nolock. * Continuing to collapse causes inconsistency. */ - if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); + if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, + pmd, referenced)) { + mmap_read_unlock(mm); goto out_nolock; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. */ - down_write(&mm->mmap_sem); + mmap_write_lock(mm); result = SCAN_ANY_PROCESS; if (!mmget_still_valid(mm)) goto out; @@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); + isolated = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); if (unlikely(!isolated)) { @@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -1100,12 +1184,13 @@ static void collapse_huge_page(struct mm_struct *mm, khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); out_nolock: + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; out: - mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int ret = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; spinlock_t *ptl; @@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; goto out_unmap; } + page = compound_head(page); + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. @@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * Here the check is racy it may see totmal_mapcount > refcount + * in some cases. + * For example, one process with one forked child process. + * The parent has the PMD split due to MADV_DONTNEED, then + * the child is trying unmap the whole PMD, but khugepaged + * may be scanning the parent between the child has + * PageDoubleMap flag cleared and dec the mapcount. So + * khugepaged may see total_mapcount > refcount. + * + * But such case is ephemeral we could always retry collapse + * later. However it may report false positive if the page + * has excessive GUP pins (i.e. 512). Anyway the same check + * will be done again later the risk seems low. */ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (!is_refcount_suitable(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, mmu_notifier_test_young(vma->vm_mm, address)) referenced++; } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_LACK_REFERENCED_PAGE; - } - } else { + if (!writable) { result = SCAN_PAGE_RO; + } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { node = khugepaged_find_target_node(); - /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, node, referenced); + /* collapse_huge_page will return with the mmap_lock released */ + collapse_huge_page(mm, address, hpage, node, + referenced, unmapped); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1418,7 +1517,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (likely(mm_slot->nr_pte_mapped_thp == 0)) return 0; - if (!down_write_trylock(&mm->mmap_sem)) + if (!mmap_write_trylock(mm)) return -EBUSY; if (unlikely(khugepaged_test_exit(mm))) @@ -1429,7 +1528,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) out: mm_slot->nr_pte_mapped_thp = 0; - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } @@ -1444,11 +1543,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing - * down_write(mmap_sem) as PMD-mapping is likely to be split + * mmap_write_lock(mm) as PMD-mapping is likely to be split * later. * * Not that vma->anon_vma check is racy: it can be set up after - * the check but before we took mmap_sem by the fault path. + * the check but before we took mmap_lock by the fault path. * But page lock would prevent establishing any new ptes of the * page, so we are safe. * @@ -1468,18 +1567,18 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (!pmd) continue; /* - * We need exclusive mmap_sem to retract page table. + * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire - * mmap_sem while holding page lock. Fault path does it in + * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. */ - if (down_write_trylock(&vma->vm_mm->mmap_sem)) { + if (mmap_write_trylock(vma->vm_mm)) { spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); - up_write(&vma->vm_mm->mmap_sem); + mmap_write_unlock(vma->vm_mm); mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } else { @@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm, struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; - struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } + count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* This will be less messy when we use multi-index entries */ do { @@ -1547,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm, break; xas_unlock_irq(&xas); if (!xas_nomem(&xas, GFP_KERNEL)) { - mem_cgroup_cancel_charge(new_page, memcg, true); result = SCAN_FAIL; goto out; } @@ -1692,6 +1790,7 @@ static void collapse_file(struct mm_struct *mm, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; + putback_lru_page(page); goto out_unlock; } @@ -1740,12 +1839,9 @@ out_unlock: } if (nr_none) { - struct zone *zone = page_zone(new_page); - - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); if (is_shmem) - __mod_node_page_state(zone->zone_pgdat, - NR_SHMEM, nr_none); + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } xa_locked: @@ -1783,15 +1879,9 @@ xa_unlocked: SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - mem_cgroup_commit_charge(new_page, memcg, false, true); - - if (is_shmem) { + if (is_shmem) set_page_dirty(new_page); - lru_cache_add_anon(new_page); - } else { - lru_cache_add_file(new_page); - } - count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); + lru_cache_add(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1838,13 +1928,14 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - mem_cgroup_cancel_charge(new_page, memcg, true); new_page->mapping = NULL; } unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); + if (!IS_ERR_OR_NULL(*hpage)) + mem_cgroup_uncharge(*hpage); /* TODO: tracepoints */ } @@ -1966,8 +2057,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, * the next mm on the list. */ vma = NULL; - if (unlikely(!down_read_trylock(&mm->mmap_sem))) - goto breakouterloop_mmap_sem; + if (unlikely(!mmap_read_trylock(mm))) + goto breakouterloop_mmap_lock; if (likely(!khugepaged_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); @@ -2011,7 +2102,7 @@ skip: pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); ret = 1; khugepaged_scan_file(mm, file, pgoff, hpage); fput(file); @@ -2024,15 +2115,15 @@ skip: khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; if (ret) - /* we released mmap_sem so break loop */ - goto breakouterloop_mmap_sem; + /* we released mmap_lock so break loop */ + goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; } } breakouterloop: - up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ -breakouterloop_mmap_sem: + mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); @@ -2083,6 +2174,8 @@ static void khugepaged_do_scan(void) barrier(); /* write khugepaged_pages_to_scan to local stack */ + lru_add_drain_all(); + while (progress < pages) { if (!khugepaged_prealloc_page(&hpage, &wait)) break; @@ -442,7 +442,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, - * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. @@ -542,11 +542,11 @@ static void break_cow(struct rmap_item *rmap_item) */ put_anon_vma(rmap_item->anon_vma); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct rmap_item *rmap_item) @@ -556,7 +556,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) struct vm_area_struct *vma; struct page *page; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; @@ -572,7 +572,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) out: page = NULL; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return page; } @@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages - * that are wrprotected and have the exact same + * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); @@ -831,7 +831,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * @@ -976,7 +976,7 @@ static int unmerge_and_remove_all_rmap_items(void) for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { mm = mm_slot->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (ksm_test_exit(mm)) break; @@ -989,7 +989,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -1012,7 +1012,7 @@ static int unmerge_and_remove_all_rmap_items(void) return 0; error: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); @@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, /* * No need to check ksm_use_zero_pages here: we can only have a - * zero_page here if ksm_use_zero_pages was enabled alreaady. + * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { get_page(kpage); @@ -1280,7 +1280,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, struct vm_area_struct *vma; int err = -EFAULT; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; @@ -1292,11 +1292,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); - /* Must get reference to anon_vma while still holding mmap_sem */ + /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1608,7 +1608,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -1843,7 +1843,7 @@ again: * continue. All KSM pages belonging to the * stable_node dups in a stable_node chain * have the same content and they're - * wrprotected at all times. Any will work + * write protected at all times. Any will work * fine to continue the walk. */ tree_page = get_ksm_page(stable_node_any, @@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item, * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check - * for other negative values as an undeflow if detected here + * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ @@ -2110,11 +2110,19 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) if (ksm_use_zero_pages && (checksum == zero_checksum)) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); - err = try_to_merge_one_page(vma, page, - ZERO_PAGE(rmap_item->address)); - up_read(&mm->mmap_sem); + if (vma) { + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + } else { + /* + * If the vma is out of date, we do not need to + * continue. + */ + err = 0; + } + mmap_read_unlock(mm); /* * In case of failure, the page was not really empty, so we * need to continue. Otherwise we're done. @@ -2277,7 +2285,7 @@ next_mm: } mm = slot->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); if (ksm_test_exit(mm)) vma = NULL; else @@ -2311,7 +2319,7 @@ next_mm: ksm_scan.address += PAGE_SIZE; } else put_page(*page); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return rmap_item; } put_page(*page); @@ -2335,13 +2343,13 @@ next_mm: struct mm_slot, mm_list); if (ksm_scan.address == 0) { /* - * We've completed a full scan of all vmas, holding mmap_sem + * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and - * mmap_sem then protects against race with MADV_MERGEABLE). + * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&slot->link); list_del(&slot->mm_list); @@ -2349,12 +2357,12 @@ next_mm: free_mm_slot(slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmdrop(mm); } else { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* - * up_read(&mm->mmap_sem) first because after + * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and @@ -2528,7 +2536,7 @@ void __ksm_exit(struct mm_struct *mm) * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use - * mmap_sem to synchronize with any break_cows before pagetables + * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ @@ -2552,8 +2560,8 @@ void __ksm_exit(struct mm_struct *mm) clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } } diff --git a/mm/list_lru.c b/mm/list_lru.c index 4d5294c39bba..9222910ab1cb 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -213,7 +213,7 @@ restart: /* * decrement nr_to_walk first so that we don't livelock if we - * get stuck on large numbesr of LRU_RETRY items + * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; diff --git a/mm/maccess.c b/mm/maccess.c index 3ca8d97e5010..f98ff91e32c6 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -1,104 +1,130 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Access kernel memory without faulting. + * Access kernel or user memory without faulting. */ #include <linux/export.h> #include <linux/mm.h> #include <linux/uaccess.h> -static __always_inline long -probe_read_common(void *dst, const void __user *src, size_t size) +bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, + size_t size) { - long ret; + return true; +} + +#ifdef HAVE_GET_KERNEL_NOFAULT + +#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ + while (len >= sizeof(type)) { \ + __get_kernel_nofault(dst, src, type, err_label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +long copy_from_kernel_nofault(void *dst, const void *src, size_t size) +{ + if (!copy_from_kernel_nofault_allowed(src, size)) + return -ERANGE; pagefault_disable(); - ret = __copy_from_user_inatomic(dst, src, size); + copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); + pagefault_enable(); + return 0; +Efault: pagefault_enable(); + return -EFAULT; +} +EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); + +#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ + while (len >= sizeof(type)) { \ + __put_kernel_nofault(dst, src, type, err_label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } - return ret ? -EFAULT : 0; +long copy_to_kernel_nofault(void *dst, const void *src, size_t size) +{ + pagefault_disable(); + copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); + pagefault_enable(); + return 0; +Efault: + pagefault_enable(); + return -EFAULT; } -static __always_inline long -probe_write_common(void __user *dst, const void *src, size_t size) +long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { - long ret; + const void *src = unsafe_addr; + + if (unlikely(count <= 0)) + return 0; + if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) + return -ERANGE; pagefault_disable(); - ret = __copy_to_user_inatomic(dst, src, size); + do { + __get_kernel_nofault(dst, src, u8, Efault); + dst++; + src++; + } while (dst[-1] && src - unsafe_addr < count); pagefault_enable(); - return ret ? -EFAULT : 0; + dst[-1] = '\0'; + return src - unsafe_addr; +Efault: + pagefault_enable(); + dst[-1] = '\0'; + return -EFAULT; } - +#else /* HAVE_GET_KERNEL_NOFAULT */ /** - * probe_kernel_read(): safely attempt to read from a kernel-space location + * copy_from_kernel_nofault(): safely attempt to read from kernel-space * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk * - * Safely read from address @src to the buffer at @dst. If a kernel fault - * happens, handle that and return -EFAULT. + * Safely read from kernel address @src to the buffer at @dst. If a kernel + * fault happens, handle that and return -EFAULT. If @src is not a valid kernel + * address, return -ERANGE. * * We ensure that the copy_from_user is executed in atomic context so that - * do_page_fault() doesn't attempt to take mmap_sem. This makes - * probe_kernel_read() suitable for use within regions where the caller - * already holds mmap_sem, or other locks which nest inside mmap_sem. - * - * probe_kernel_read_strict() is the same as probe_kernel_read() except for - * the case where architectures have non-overlapping user and kernel address - * ranges: probe_kernel_read_strict() will additionally return -EFAULT for - * probing memory on a user address range where probe_user_read() is supposed - * to be used instead. + * do_page_fault() doesn't attempt to take mmap_lock. This makes + * copy_from_kernel_nofault() suitable for use within regions where the caller + * already holds mmap_lock, or other locks which nest inside mmap_lock. */ - -long __weak probe_kernel_read(void *dst, const void *src, size_t size) - __attribute__((alias("__probe_kernel_read"))); - -long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size) - __attribute__((alias("__probe_kernel_read"))); - -long __probe_kernel_read(void *dst, const void *src, size_t size) +long copy_from_kernel_nofault(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = probe_read_common(dst, (__force const void __user *)src, size); - set_fs(old_fs); - - return ret; -} -EXPORT_SYMBOL_GPL(probe_kernel_read); - -/** - * probe_user_read(): safely attempt to read from a user-space location - * @dst: pointer to the buffer that shall take the data - * @src: address to read from. This must be a user address. - * @size: size of the data chunk - * - * Safely read from user address @src to the buffer at @dst. If a kernel fault - * happens, handle that and return -EFAULT. - */ - -long __weak probe_user_read(void *dst, const void __user *src, size_t size) - __attribute__((alias("__probe_user_read"))); - -long __probe_user_read(void *dst, const void __user *src, size_t size) -{ - long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + if (!copy_from_kernel_nofault_allowed(src, size)) + return -ERANGE; - set_fs(USER_DS); - if (access_ok(src, size)) - ret = probe_read_common(dst, src, size); + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, + size); + pagefault_enable(); set_fs(old_fs); - return ret; + if (ret) + return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(probe_user_read); +EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); /** - * probe_kernel_write(): safely attempt to write to a location + * copy_to_kernel_nofault(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk @@ -106,52 +132,25 @@ EXPORT_SYMBOL_GPL(probe_user_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ - -long __weak probe_kernel_write(void *dst, const void *src, size_t size) - __attribute__((alias("__probe_kernel_write"))); - -long __probe_kernel_write(void *dst, const void *src, size_t size) +long copy_to_kernel_nofault(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = probe_write_common((__force void __user *)dst, src, size); - set_fs(old_fs); - - return ret; -} -EXPORT_SYMBOL_GPL(probe_kernel_write); - -/** - * probe_user_write(): safely attempt to write to a user-space location - * @dst: address to write to - * @src: pointer to the data that shall be written - * @size: size of the data chunk - * - * Safely write to address @dst from the buffer at @src. If a kernel fault - * happens, handle that and return -EFAULT. - */ - -long __weak probe_user_write(void __user *dst, const void *src, size_t size) - __attribute__((alias("__probe_user_write"))); - -long __probe_user_write(void __user *dst, const void *src, size_t size) -{ - long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); - - set_fs(USER_DS); - if (access_ok(dst, size)) - ret = probe_write_common(dst, src, size); + pagefault_disable(); + ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + pagefault_enable(); set_fs(old_fs); - return ret; + if (ret) + return -EFAULT; + return 0; } -EXPORT_SYMBOL_GPL(probe_user_write); /** - * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. + * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe + * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe address. @@ -161,27 +160,14 @@ EXPORT_SYMBOL_GPL(probe_user_write); * * On success, returns the length of the string INCLUDING the trailing NUL. * - * If access fails, returns -EFAULT (some data may have been copied - * and the trailing NUL added). + * If access fails, returns -EFAULT (some data may have been copied and the + * trailing NUL added). If @unsafe_addr is not a valid kernel address, return + * -ERANGE. * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. - * - * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except - * for the case where architectures have non-overlapping user and kernel address - * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for - * probing memory on a user address range where strncpy_from_unsafe_user() is - * supposed to be used instead. */ - -long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) - __attribute__((alias("__strncpy_from_unsafe"))); - -long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, - long count) - __attribute__((alias("__strncpy_from_unsafe"))); - -long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) +long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); const void *src = unsafe_addr; @@ -189,6 +175,8 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) if (unlikely(count <= 0)) return 0; + if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) + return -ERANGE; set_fs(KERNEL_DS); pagefault_disable(); @@ -203,9 +191,66 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) return ret ? -EFAULT : src - unsafe_addr; } +#endif /* HAVE_GET_KERNEL_NOFAULT */ + +/** + * copy_from_user_nofault(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long copy_from_user_nofault(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(src, size)) { + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + } + set_fs(old_fs); + + if (ret) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(copy_from_user_nofault); + +/** + * copy_to_user_nofault(): safely attempt to write to a user-space location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long copy_to_user_nofault(void __user *dst, const void *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(dst, size)) { + pagefault_disable(); + ret = __copy_to_user_inatomic(dst, src, size); + pagefault_enable(); + } + set_fs(old_fs); + + if (ret) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(copy_to_user_nofault); /** - * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user + * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. @@ -222,7 +267,7 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ -long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, +long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); @@ -248,7 +293,7 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, } /** - * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. + * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * @@ -263,7 +308,7 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, * Unlike strnlen_user, this can be used from IRQ handler etc. because * it disables pagefaults. */ -long strnlen_unsafe_user(const void __user *unsafe_addr, long count) +long strnlen_user_nofault(const void __user *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); int ret; diff --git a/mm/madvise.c b/mm/madvise.c index 4bb30ed6c8d2..dd1d43cf026d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -27,6 +27,7 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> +#include <linux/sched/mm.h> #include <asm/tlb.h> @@ -39,7 +40,7 @@ struct madvise_walk_private { /* * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_sem for writing. Others, which simply traverse vmas, need + * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading. */ static int madvise_need_mmap_write(int behavior) @@ -164,7 +165,7 @@ static long madvise_behavior(struct vm_area_struct *vma, success: /* - * vm_flags is protected by the mmap_sem held in write mode. + * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; @@ -284,16 +285,16 @@ static long madvise_willneed(struct vm_area_struct *vma, * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop - * mmap_sem. + * mmap_lock. */ - *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); return 0; } @@ -767,9 +768,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { - *prev = NULL; /* mmap_sem has been dropped, prev is stale */ + *prev = NULL; /* mmap_lock has been dropped, prev is stale */ - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = find_vma(current->mm, start); if (!vma) return -ENOMEM; @@ -790,7 +791,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old - * vma was splitted while the mmap_sem was + * vma was splitted while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an @@ -825,7 +826,7 @@ static long madvise_remove(struct vm_area_struct *vma, int error; struct file *f; - *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + *prev = NULL; /* tell sys_madvise we drop mmap_lock */ if (vma->vm_flags & VM_LOCKED) return -EINVAL; @@ -846,18 +847,18 @@ static long madvise_remove(struct vm_area_struct *vma, * Filesystem's fallocate may need to take i_mutex. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop - * mmap_sem. + * mmap_lock. */ get_file(f); if (userfaultfd_remove(vma, start, end)) { - /* mmap_sem was not released by userfaultfd_remove() */ - up_read(¤t->mm->mmap_sem); + /* mmap_lock was not released by userfaultfd_remove() */ + mmap_read_unlock(current->mm); } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); return error; } @@ -1088,10 +1089,27 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) write = madvise_need_mmap_write(behavior); if (write) { - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; + + /* + * We may have stolen the mm from another process + * that is undergoing core dumping. + * + * Right now that's io_ring, in the future it may + * be remote process management and not "current" + * at all. + * + * We need to fix core dumping to not do this, + * but for now we have the mmget_still_valid() + * model. + */ + if (!mmget_still_valid(current->mm)) { + mmap_write_unlock(current->mm); + return -EINTR; + } } else { - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); } /* @@ -1135,15 +1153,15 @@ int do_madvise(unsigned long start, size_t len_in, int behavior) goto out; if (prev) vma = prev->vm_next; - else /* madvise_remove dropped mmap_sem */ + else /* madvise_remove dropped mmap_lock */ vma = find_vma(current->mm, start); } out: blk_finish_plug(&plug); if (write) - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); else - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); return error; } diff --git a/mm/memblock.c b/mm/memblock.c index c79ba6f9920c..39aceafc57f6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -78,7 +78,7 @@ * * memblock_alloc*() - these functions return the **virtual** address * of the allocated memory. * - * Note, that both API variants use implict assumptions about allowed + * Note, that both API variants use implicit assumptions about allowed * memory ranges and the fallback methods. Consult the documentation * of memblock_alloc_internal() and memblock_alloc_range_nid() * functions for more elaborate description. @@ -620,7 +620,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, *idx = ULLONG_MAX; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * Common iterator interface used to define for_each_mem_pfn_range(). */ @@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, { struct memblock_type *type = &memblock.memory; struct memblock_region *r; + int r_nid; while (++*idx < type->cnt) { r = &type->regions[*idx]; + r_nid = memblock_get_region_node(r); if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r->nid) + if (nid == MAX_NUMNODES || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (out_end_pfn) *out_end_pfn = PFN_DOWN(r->base + r->size); if (out_nid) - *out_nid = r->nid; + *out_nid = r_nid; } /** @@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { +#ifdef CONFIG_NEED_MULTIPLE_NODES int start_rgn, end_rgn; int i, ret; @@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, memblock_set_region_node(&type->regions[i], nid); memblock_merge_regions(type); +#endif return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /** * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() @@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr) return !memblock_is_nomap(&memblock.memory.regions[i]); } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { @@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); - return type->regions[mid].nid; + return memblock_get_region_node(&type->regions[mid]); } -#endif /** * memblock_is_region_memory - check if a region is a subset of memory @@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5beea03dd58a..0b38b6ad547d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -int do_swap_account __read_mostly; +bool cgroup_memory_noswap __read_mostly; #else -#define do_swap_account 0 +#define cgroup_memory_noswap 1 #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } #define THRESHOLDS_EVENTS_TARGET 128 @@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool compound, int nr_pages) + int nr_pages) { - /* - * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is - * counted as CACHE even if it's on ANON LRU. - */ - if (PageAnon(page)) - __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); - else { - __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); - if (PageSwapBacked(page)) - __mod_memcg_state(memcg, NR_SHMEM, nr_pages); - } - - if (compound) { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); - } - /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); @@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, * @page: the page * @pgdat: pgdat of the page * - * This function is only safe when following the LRU page isolation - * and putback protocol: the LRU lock must be held, and the page must - * either be PageLRU() or the caller must have isolated/allocated it. + * This function relies on page->mem_cgroup being stable - see the + * access rules in commit_charge(). */ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { @@ -1314,7 +1296,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.max); - if (count <= limit) + if (count < limit) margin = min(margin, limit - count); else margin = 0; @@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg) */ seq_buf_printf(&s, "anon %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS) * + (u64)memcg_page_state(memcg, NR_ANON_MAPPED) * PAGE_SIZE); seq_buf_printf(&s, "file %llu\n", - (u64)memcg_page_state(memcg, MEMCG_CACHE) * + (u64)memcg_page_state(memcg, NR_FILE_PAGES) * PAGE_SIZE); seq_buf_printf(&s, "kernel_stack %llu\n", (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * @@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg) (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); - /* - * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter - * with the NR_ANON_THP vm counter, but right now it's a pain in the - * arse because it requires migrating the work out of rmap to a place - * where the page->mem_cgroup is set up and stable. - */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE seq_buf_printf(&s, "anon_thp %llu\n", - (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * - PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_ANON_THPS) * + HPAGE_PMD_SIZE); +#endif for (i = 0; i < NR_LRU_LISTS; i++) seq_buf_printf(&s, "%s %llu\n", lru_list_name(i), @@ -1451,6 +1429,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -1979,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) */ struct mem_cgroup *lock_page_memcg(struct page *page) { + struct page *head = compound_head(page); /* rmap on tail pages */ struct mem_cgroup *memcg; unsigned long flags; @@ -1998,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = page->mem_cgroup; + memcg = head->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2006,7 +1987,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != page->mem_cgroup) { + if (memcg != head->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2049,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) */ void unlock_page_memcg(struct page *page) { - __unlock_page_memcg(page->mem_cgroup); + struct page *head = compound_head(page); + + __unlock_page_memcg(head->mem_cgroup); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2250,7 +2233,8 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2319,41 +2303,64 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14 -/* - * Get the number of jiffies that we should penalise a mischievous cgroup which - * is exceeding its memory.high by checking both it and its ancestors. - */ -static unsigned long calculate_high_delay(struct mem_cgroup *memcg, - unsigned int nr_pages) +static u64 calculate_overage(unsigned long usage, unsigned long high) { - unsigned long penalty_jiffies; - u64 max_overage = 0; + u64 overage; - do { - unsigned long usage, high; - u64 overage; + if (usage <= high) + return 0; - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); - if (usage <= high) - continue; + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + return div64_u64(overage, high); +} - /* - * Prevent division by 0 in overage calculation by acting as if - * it was a threshold of 1 page - */ - high = max(high, 1UL); +static u64 mem_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->memory), + READ_ONCE(memcg->memory.high)); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} - overage = usage - high; - overage <<= MEMCG_DELAY_PRECISION_SHIFT; - overage = div64_u64(overage, high); +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; - if (overage > max_overage) - max_overage = overage; + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); + return max_overage; +} + +/* + * Get the number of jiffies that we should penalise a mischievous cgroup which + * is exceeding its memory.high by checking both it and its ancestors. + */ +static unsigned long calculate_high_delay(struct mem_cgroup *memcg, + unsigned int nr_pages, + u64 max_overage) +{ + unsigned long penalty_jiffies; + if (!max_overage) return 0; @@ -2377,14 +2384,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or * larger the current charge patch is than that. */ - penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; - - /* - * Clamp the max delay per usermode return so as to still keep the - * application moving forwards and also permit diagnostics, albeit - * extremely slowly. - */ - return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; } /* @@ -2409,7 +2409,18 @@ void mem_cgroup_handle_over_high(void) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. */ - penalty_jiffies = calculate_high_delay(memcg, nr_pages); + penalty_jiffies = calculate_high_delay(memcg, nr_pages, + mem_find_max_overage(memcg)); + + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); /* * Don't sleep if the amount of jiffies this memcg owes us is so low @@ -2594,12 +2605,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. + */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -2609,6 +2640,7 @@ done_restock: return 0; } +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) @@ -2620,70 +2652,20 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } +#endif -static void lock_page_lru(struct page *page, int *isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - spin_lock_irq(&pgdat->lru_lock); - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - *isolated = 1; - } else - *isolated = 0; -} - -static void unlock_page_lru(struct page *page, int isolated) -{ - pg_data_t *pgdat = page_pgdat(page); - - if (isolated) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - add_page_to_lru_list(page, lruvec, page_lru(page)); - } - spin_unlock_irq(&pgdat->lru_lock); -} - -static void commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - int isolated; - VM_BUG_ON_PAGE(page->mem_cgroup, page); - /* - * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page - * may already be on some other mem_cgroup's LRU. Take care of it. - */ - if (lrucare) - lock_page_lru(page, &isolated); - - /* - * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point: - * - * - the page is uncharged + * Any of the following ensures page->mem_cgroup stability: * - * - the page is off-LRU - * - * - an anonymous fault has exclusive page access, except for - * a locked page table - * - * - a page cache insertion, a swapin fault, or a migration - * have the page locked + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference */ page->mem_cgroup = memcg; - - if (lrucare) - unlock_page_lru(page, isolated); } #ifdef CONFIG_MEMCG_KMEM @@ -2802,7 +2784,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, static inline bool memcg_kmem_bypass(void) { - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (in_interrupt()) + return true; + + /* Allow remote memcg charging in kthread contexts. */ + if ((!current->mm || (current->flags & PF_KTHREAD)) && + !current->active_memcg) return true; return false; } @@ -3015,8 +3002,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - - __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3201,7 +3186,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsiblity. + * hierarchy. Testing use_hierarchy is the caller's responsibility. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -3299,8 +3284,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - val = memcg_page_state(memcg, MEMCG_CACHE) + - memcg_page_state(memcg, MEMCG_RSS); + val = memcg_page_state(memcg, NR_FILE_PAGES) + + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) val += memcg_page_state(memcg, MEMCG_SWAP); } else { @@ -3688,7 +3673,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) + int nid, unsigned int lru_mask, bool tree) { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; @@ -3699,13 +3684,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + if (tree) + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + else + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); } return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) + unsigned int lru_mask, + bool tree) { unsigned long nr = 0; enum lru_list lru; @@ -3713,7 +3702,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + if (tree) + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + else + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); } return nr; } @@ -3733,34 +3725,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) }; const struct numa_stat *stat; int nid; - unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); - seq_printf(m, "%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + seq_printf(m, "%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + false)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, false)); seq_putc(m, '\n'); } for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - struct mem_cgroup *iter; - - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); - seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_node_nr_lru_pages( - iter, nid, stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } + + seq_printf(m, "hierarchical_%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + true)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, true)); seq_putc(m, '\n'); } @@ -3769,9 +3755,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ static const unsigned int memcg1_stats[] = { - MEMCG_CACHE, - MEMCG_RSS, - MEMCG_RSS_HUGE, + NR_FILE_PAGES, + NR_ANON_MAPPED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif NR_SHMEM, NR_FILE_MAPPED, NR_FILE_DIRTY, @@ -3782,7 +3770,9 @@ static const unsigned int memcg1_stats[] = { static const char *const memcg1_stat_names[] = { "cache", "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE "rss_huge", +#endif "shmem", "mapped_file", "dirty", @@ -3808,11 +3798,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state_local(memcg, memcg1_stats[i]) * - PAGE_SIZE); + nr = memcg_page_state_local(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -3858,23 +3853,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) { pg_data_t *pgdat; struct mem_cgroup_per_node *mz; - struct zone_reclaim_stat *rstat; - unsigned long recent_rotated[2] = {0, 0}; - unsigned long recent_scanned[2] = {0, 0}; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; for_each_online_pgdat(pgdat) { mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; } - seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); - seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); - seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); - seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); } #endif @@ -4330,7 +4319,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); @@ -4338,7 +4326,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4850,7 +4838,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of * memory-controlled cgroups to 64k. * - * However, there usually are many references to the oflline CSS after + * However, there usually are many references to the offline CSS after * the cgroup has been destroyed, such as page cache or reclaimable * slab objects, that don't need to hang on to the ID. We want to keep * those dead CSS from occupying IDs, or we might quickly exhaust the @@ -4990,19 +4978,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void) unsigned int size; int node; int __maybe_unused i; + long error = -ENOMEM; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return NULL; + return ERR_PTR(error); memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, GFP_KERNEL); - if (memcg->id.id < 0) + if (memcg->id.id < 0) { + error = memcg->id.id; goto fail; + } memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); if (!memcg->vmstats_local) @@ -5046,7 +5037,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) fail: mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); - return NULL; + return ERR_PTR(error); } static struct cgroup_subsys_state * __ref @@ -5057,11 +5048,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) long error = -ENOMEM; memcg = mem_cgroup_alloc(); - if (!memcg) - return ERR_PTR(error); + if (IS_ERR(memcg)) + return ERR_CAST(memcg); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5108,7 +5100,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) fail: mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) @@ -5213,8 +5205,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -5305,8 +5298,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); - if (do_memsw_account()) - entry->val = ent.val; + entry->val = ent.val; return page; } @@ -5340,8 +5332,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_memsw_account()) - *entry = swp; + *entry = swp; page = find_get_page(swap_address_space(swp), swp_offset(swp)); } @@ -5372,10 +5363,8 @@ static int mem_cgroup_move_account(struct page *page, { struct lruvec *from_vec, *to_vec; struct pglist_data *pgdat; - unsigned long flags; unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; - bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5393,30 +5382,47 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; - anon = PageAnon(page); - pgdat = page_pgdat(page); from_vec = mem_cgroup_lruvec(from, pgdat); to_vec = mem_cgroup_lruvec(to, pgdat); - spin_lock_irqsave(&from->move_lock, flags); + lock_page_memcg(page); - if (!anon && page_mapped(page)) { - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); - } + if (PageAnon(page)) { + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (PageTransHuge(page)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } - /* - * move_lock grabbed above and caller set from->moving_account, so - * mod_memcg_page_state will serialize updates to PageDirty. - * So mapping should be stable for dirty pages. - */ - if (!anon && PageDirty(page)) { - struct address_space *mapping = page_mapping(page); + } + } else { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - if (mapping_cap_account_dirty(mapping)) { - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages); + if (PageSwapBacked(page)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } + + if (page_mapped(page)) { + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); + } + + if (PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, + -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, + nr_pages); + } } } @@ -5426,22 +5432,30 @@ static int mem_cgroup_move_account(struct page *page, } /* + * All state has been migrated, let's switch to the new memcg. + * * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. + * is referenced, charged, isolated, and locked: we can't race + * with (un)charging, migration, LRU putback, or anything else + * that would rely on a stable page->mem_cgroup. + * + * Note that lock_page_memcg is a memcg lock, not a page lock, + * to save space. As soon as we switch page->mem_cgroup to a + * new memcg that isn't locked, the above state can change + * concurrently again. Make sure we're truly done with it. */ + smp_mb(); - /* caller should have done css_get */ - page->mem_cgroup = to; + page->mem_cgroup = to; /* caller should have done css_get */ - spin_unlock_irqrestore(&from->move_lock, flags); + __unlock_page_memcg(from); ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, compound, nr_pages); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, compound, -nr_pages); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -5600,9 +5614,9 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) { unsigned long precharge; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); precharge = mc.precharge; mc.precharge = 0; @@ -5885,9 +5899,9 @@ static void mem_cgroup_move_charge(void) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { + if (unlikely(!mmap_read_trylock(mc.mm))) { /* - * Someone who are holding the mmap_sem might be waiting in + * Someone who are holding the mmap_lock might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, * and retry. Because we cancel precharges, we might not be able * to move enough charges, but moving charge is a best-effort @@ -5904,7 +5918,7 @@ retry: walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, NULL); - up_read(&mc.mm->mmap_sem); + mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } @@ -6012,7 +6026,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -6029,7 +6044,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - WRITE_ONCE(memcg->high, high); + page_counter_set_high(&memcg->memory, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6224,7 +6239,6 @@ static struct cftype memory_files[] = { }, { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, { @@ -6427,125 +6441,63 @@ out: } /** - * mem_cgroup_try_charge - try charging a page + * mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode - * @memcgp: charged memcg return - * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. * - * Returns 0 on success, with *@memcgp pointing to the charged memcg. - * Otherwise, an error code is returned. - * - * After page->mapping has been set up, the caller must finalize the - * charge with mem_cgroup_commit_charge(). Or abort the transaction - * with mem_cgroup_cancel_charge() in case page instantiation fails. + * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + unsigned int nr_pages = hpage_nr_pages(page); struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) goto out; if (PageSwapCache(page)) { + swp_entry_t ent = { .val = page_private(page), }; + unsigned short id; + /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which + * already charged pages, too. page->mem_cgroup is protected + * by the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound_head(page)->mem_cgroup) goto out; - if (do_swap_account) { - swp_entry_t ent = { .val = page_private(page), }; - unsigned short id = lookup_swap_cgroup_id(ent); - - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } + id = lookup_swap_cgroup_id(ent); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); } if (!memcg) memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); + if (ret) + goto out_put; - css_put(&memcg->css); -out: - *memcgp = memcg; - return ret; -} - -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound) -{ - struct mem_cgroup *memcg; - int ret; - - ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); - memcg = *memcgp; - mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); - return ret; -} - -/** - * mem_cgroup_commit_charge - commit a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @lrucare: page might be on LRU already - * @compound: charge the page as compound or small page - * - * Finalize a charge transaction started by mem_cgroup_try_charge(), - * after page->mapping has been set up. This must happen atomically - * as part of the page instantiation, i.e. under the page table lock - * for anonymous pages, under the page lock for page and swap cache. - * - * In addition, the page must not be on the LRU during the commit, to - * prevent racing with task migration. If it might be, use @lrucare. - * - * Use mem_cgroup_cancel_charge() to cancel the transaction instead. - */ -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - VM_BUG_ON_PAGE(!page->mapping, page); - VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - commit_charge(page, memcg, lrucare); + commit_charge(page, memcg); local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_memsw_account() && PageSwapCache(page)) { + if (PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -6554,42 +6506,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, */ mem_cgroup_uncharge_swap(entry, nr_pages); } -} -/** - * mem_cgroup_cancel_charge - cancel a page charge - * @page: page to charge - * @memcg: memcg to charge the page to - * @compound: charge the page as compound or small page - * - * Cancel a charge transaction started by mem_cgroup_try_charge(). - */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound) -{ - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; - - if (mem_cgroup_disabled()) - return; - /* - * Swap faults will attempt to charge the same page multiple - * times. But reuse_swap_page() might have removed the page - * from swapcache already, so we can't check PageSwapCache(). - */ - if (!memcg) - return; - - cancel_charge(memcg, nr_pages); +out_put: + css_put(&memcg->css); +out: + return ret; } struct uncharge_gather { struct mem_cgroup *memcg; + unsigned long nr_pages; unsigned long pgpgout; - unsigned long nr_anon; - unsigned long nr_file; unsigned long nr_kmem; - unsigned long nr_huge; - unsigned long nr_shmem; struct page *dummy_page; }; @@ -6600,37 +6528,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { - unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(ug->memcg)) { - page_counter_uncharge(&ug->memcg->memory, nr_pages); + page_counter_uncharge(&ug->memcg->memory, ug->nr_pages); if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, nr_pages); + page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); memcg_oom_recover(ug->memcg); } local_irq_save(flags); - __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); - __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); - __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); - __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); if (!mem_cgroup_is_root(ug->memcg)) - css_put_many(&ug->memcg->css, nr_pages); + css_put_many(&ug->memcg->css, ug->nr_pages); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) { + unsigned long nr_pages; + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && - !PageHWPoison(page) , page); if (!page->mem_cgroup) return; @@ -6649,23 +6572,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) ug->memcg = page->mem_cgroup; } - if (!PageKmemcg(page)) { - unsigned int nr_pages = 1; + nr_pages = compound_nr(page); + ug->nr_pages += nr_pages; - if (PageTransHuge(page)) { - nr_pages = compound_nr(page); - ug->nr_huge += nr_pages; - } - if (PageAnon(page)) - ug->nr_anon += nr_pages; - else { - ug->nr_file += nr_pages; - if (PageSwapBacked(page)) - ug->nr_shmem += nr_pages; - } + if (!PageKmemcg(page)) { ug->pgpgout++; } else { - ug->nr_kmem += compound_nr(page); + ug->nr_kmem += nr_pages; __ClearPageKmemcg(page); } @@ -6702,8 +6615,7 @@ static void uncharge_list(struct list_head *page_list) * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_try_charge() and - * mem_cgroup_commit_charge(). + * Uncharge a page previously charged with mem_cgroup_charge(). */ void mem_cgroup_uncharge(struct page *page) { @@ -6726,7 +6638,7 @@ void mem_cgroup_uncharge(struct page *page) * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + * mem_cgroup_charge(). */ void mem_cgroup_uncharge_list(struct list_head *page_list) { @@ -6779,11 +6691,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, false); + commit_charge(newpage, memcg); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), - nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } @@ -6971,7 +6882,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_memsw_account()) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; memcg = page->mem_cgroup; @@ -7000,7 +6911,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg != swap_memcg) { + if (!cgroup_memory_noswap && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7013,8 +6924,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), - -nr_entries); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) @@ -7037,7 +6947,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; memcg = page->mem_cgroup; @@ -7053,7 +6963,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!mem_cgroup_is_root(memcg) && + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7081,14 +6991,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) { + if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7104,7 +7011,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7121,37 +7028,33 @@ bool mem_cgroup_swap_full(struct page *page) if (vm_swap_full()) return true; - if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = page->mem_cgroup; if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } -/* for remember boot option*/ -#ifdef CONFIG_MEMCG_SWAP_ENABLED -static int really_do_swap_account __initdata = 1; -#else -static int really_do_swap_account __initdata; -#endif - -static int __init enable_swap_account(char *s) +static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - really_do_swap_account = 1; + cgroup_memory_noswap = 0; else if (!strcmp(s, "0")) - really_do_swap_account = 0; + cgroup_memory_noswap = 1; return 1; } -__setup("swapaccount=", enable_swap_account); +__setup("swapaccount=", setup_swap_account); static u64 swap_current_read(struct cgroup_subsys_state *css, struct cftype *cft) @@ -7161,6 +7064,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7188,6 +7114,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7203,6 +7131,12 @@ static struct cftype swap_files[] = { .read_u64 = swap_current_read, }, { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, + { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = swap_max_show, @@ -7217,7 +7151,7 @@ static struct cftype swap_files[] = { { } /* terminate */ }; -static struct cftype memsw_cgroup_files[] = { +static struct cftype memsw_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -7246,13 +7180,16 @@ static struct cftype memsw_cgroup_files[] = { static int __init mem_cgroup_swap_init(void) { - if (!mem_cgroup_disabled() && really_do_swap_account) { - do_swap_account = 1; - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, - swap_files)); - WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, - memsw_cgroup_files)); - } + /* No memory control -> no swap control */ + if (mem_cgroup_disabled()) + cgroup_memory_noswap = true; + + if (cgroup_memory_noswap) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); + return 0; } subsys_initcall(mem_cgroup_swap_init); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a96364be8ab4..47b8ccb1fb9b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -210,14 +210,15 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { struct task_struct *t = tk->tsk; short addr_lsb = tk->size_shift; - int ret; + int ret = 0; pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + pfn, t->comm, t->pid); - if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, - addr_lsb); + if (flags & MF_ACTION_REQUIRED) { + WARN_ON_ONCE(t != current); + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); } else { /* * Don't use force here, it's convenient if the signal @@ -399,9 +400,15 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) { struct task_struct *t; - for_each_thread(tsk, t) - if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) - return t; + for_each_thread(tsk, t) { + if (t->flags & PF_MCE_PROCESS) { + if (t->flags & PF_MCE_EARLY) + return t; + } else { + if (sysctl_memory_failure_early_kill) + return t; + } + } return NULL; } @@ -410,21 +417,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) * to be signaled when some page under the process is hwpoisoned. * Return task_struct of the dedicated thread (main thread unless explicitly * specified) if the process is "early kill," and otherwise returns NULL. + * + * Note that the above is true for Action Optional case, but not for Action + * Required case where SIGBUS should sent only to the current thread. */ static struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) { - struct task_struct *t; if (!tsk->mm) return NULL; - if (force_early) - return tsk; - t = find_early_kill_thread(tsk); - if (t) - return t; - if (sysctl_memory_failure_early_kill) - return tsk; - return NULL; + if (force_early) { + /* + * Comparing ->mm here because current task might represent + * a subthread, while tsk always points to the main thread. + */ + if (tsk->mm == current->mm) + return current; + else + return NULL; + } + return find_early_kill_thread(tsk); } /* @@ -1493,7 +1505,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = this_cpu_ptr(&memory_failure_cpu); + mf_cpu = container_of(work, struct memory_failure_cpu, work); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); @@ -1507,6 +1519,19 @@ static void memory_failure_work_func(struct work_struct *work) } } +/* + * Process memory_failure work queued on the specified CPU. + * Used to avoid return-to-userspace racing with the memory_failure workqueue. + */ +void memory_failure_queue_kick(int cpu) +{ + struct memory_failure_cpu *mf_cpu; + + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + cancel_work_sync(&mf_cpu->work); + memory_failure_work_func(&mf_cpu->work); +} + static int __init memory_failure_init(void) { struct memory_failure_cpu *mf_cpu; diff --git a/mm/memory.c b/mm/memory.c index f703fe8c8346..dc7f3543b1fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -80,7 +80,6 @@ #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> -#include <asm/pgtable.h> #include "internal.h" @@ -802,8 +801,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - } else if (pte_devmap(pte)) { - page = pte_page(pte); } out_set_pte: @@ -1188,7 +1185,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is * none or trans huge it can change under us. This is - * because MADV_DONTNEED holds the mmap_sem in read + * because MADV_DONTNEED holds the mmap_lock in read * mode. */ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) @@ -1214,7 +1211,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { - VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; @@ -1595,7 +1592,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || end_addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } @@ -1639,7 +1636,7 @@ EXPORT_SYMBOL(vm_insert_pages); * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler - * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * @@ -1653,7 +1650,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!page_count(page)) return -EINVAL; if (!(vma->vm_flags & VM_MIXEDMAP)) { - BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } @@ -2436,10 +2433,9 @@ static inline bool cow_user_page(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* * Other thread has already handled the fault - * and we don't need to do anything. If it's - * not the case, the fault will be triggered - * again on the same address. + * and update local tlb only */ + update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } @@ -2463,13 +2459,14 @@ static inline bool cow_user_page(struct page *dst, struct page *src, vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { - /* The PTE changed under us. Retry page fault. */ + /* The PTE changed under us, update local tlb */ + update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } /* - * The same page can be mapped back since last copy attampt. + * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { @@ -2576,7 +2573,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * - * Drop the mmap_sem before waiting on IO, if we can. The file + * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { @@ -2626,7 +2623,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) /* * Handle the case of a page which we actually need to copy to a new page. * - * Called with mmap_sem locked and the old page referenced, but + * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: @@ -2647,7 +2644,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - struct mem_cgroup *memcg; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) @@ -2678,8 +2674,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } } - if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); @@ -2704,6 +2701,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* * Clear the pte entry and flush it first, before updating the @@ -2713,7 +2711,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2752,7 +2749,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg, false); + update_mmu_tlb(vma, vmf->address, vmf->pte); } if (new_page) @@ -2812,6 +2809,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) * pte_offset_map_lock. */ if (!pte_same(*vmf->pte, vmf->orig_pte)) { + update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } @@ -2889,9 +2887,9 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * - * We enter with non-exclusive mmap_sem (to exclude vma changes, + * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) @@ -2936,6 +2934,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_same(*vmf->pte, vmf->orig_pte)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); unlock_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(vmf->page); @@ -3079,18 +3078,17 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* - * We enter with non-exclusive mmap_sem (to exclude vma changes, + * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * - * We return with the mmap_sem locked or unlocked in the same cases + * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; - struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; @@ -3131,10 +3129,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { + int err; + __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); - lru_cache_add_anon(page); + + /* Tell memcg to use swap ownership records */ + SetPageSwapCache(page); + err = mem_cgroup_charge(page, vma->vm_mm, + GFP_KERNEL); + ClearPageSwapCache(page); + if (err) + goto out_page; + + lru_cache_add(page); swap_readpage(page, true); } } else { @@ -3195,11 +3204,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = VM_FAULT_OOM; - goto out_page; - } + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. @@ -3247,11 +3252,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } @@ -3287,7 +3290,6 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); @@ -3301,14 +3303,13 @@ out_release: } /* - * We enter with non-exclusive mmap_sem (to exclude vma changes, + * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; struct page *page; vm_fault_t ret = 0; pte_t entry; @@ -3322,10 +3323,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * pte_offset_map() on pmds where a huge pmd might be created * from a different thread. * - * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when * parallel threads are excluded by other means. * - * Here we only have down_read(mmap_sem). + * Here we only have mmap_read_lock(mm). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; @@ -3341,8 +3342,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) + if (!pte_none(*vmf->pte)) { + update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; + } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; @@ -3361,9 +3364,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (!page) goto oom; - if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg, - false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; + cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that @@ -3373,13 +3376,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_none(*vmf->pte)) + if (!pte_none(*vmf->pte)) { + update_mmu_cache(vma, vmf->address, vmf->pte); goto release; + } ret = check_stable_address_space(vma->vm_mm); if (ret) @@ -3388,14 +3394,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3406,7 +3410,6 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - mem_cgroup_cancel_charge(page, memcg, false); put_page(page); goto unlock; oom_free_page: @@ -3416,7 +3419,7 @@ oom: } /* - * The mmap_sem must have been held on entry, and may have been + * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ @@ -3611,7 +3614,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * mapping. If needed, the fucntion allocates page table or use pre-allocated. * * @vmf: fault environment - * @memcg: memcg to charge page (only for private mappings) * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on @@ -3622,8 +3624,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Return: %0 on success, %VM_FAULT_ code in case of error. */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page) +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3631,9 +3632,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - /* THP on COW? */ - VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; @@ -3646,18 +3644,20 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, } /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) + if (unlikely(!pte_none(*vmf->pte))) { + update_mmu_tlb(vma, vmf->address, vmf->pte); return VM_FAULT_NOPAGE; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); @@ -3706,7 +3706,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) - ret = alloc_set_pte(vmf, vmf->memcg, page); + ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3866,11 +3866,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &vmf->memcg, false)) { + if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } + cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3888,7 +3888,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } @@ -3929,11 +3928,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) } /* - * We enter with non-exclusive mmap_sem (to exclude vma changes, + * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). - * The mmap_sem may have been released depending on flags and our + * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). - * If mmap_sem is released, vma may become invalid (for example + * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) @@ -4162,10 +4161,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow + * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * - * The mmap_sem may have been released depending on flags and our return value. + * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) @@ -4187,7 +4186,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) /* * A regular pmd is established and it can't morph into a huge * pmd from under us anymore at this point because we hold the - * mmap_sem read mode and khugepaged takes it in write mode. + * mmap_lock read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); @@ -4224,8 +4223,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); entry = vmf->orig_pte; - if (unlikely(!pte_same(*vmf->pte, entry))) + if (unlikely(!pte_same(*vmf->pte, entry))) { + update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); goto unlock; + } if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) return do_wp_page(vmf); @@ -4253,7 +4254,7 @@ unlock: /* * By the time we get here, we already hold the mm semaphore * - * The mmap_sem may have been released depending on flags and our + * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, @@ -4348,7 +4349,7 @@ retry_pud: /* * By the time we get here, we already hold the mm semaphore * - * The mmap_sem may have been released depending on flags and our + * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, @@ -4434,19 +4435,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); -#ifndef __ARCH_HAS_5LEVEL_HACK if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); -#else - if (!pgd_present(*p4d)) { - mm_inc_nr_puds(mm); - pgd_populate(mm, p4d, new); - } else /* Another has populated it */ - pud_free(mm, new); -#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } @@ -4665,7 +4658,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *old_buf = buf; int write = gup_flags & FOLL_WRITE; - if (down_read_killable(&mm->mmap_sem)) + if (mmap_read_lock_killable(mm)) return 0; /* ignore errors, just check how much was successfully transferred */ @@ -4716,7 +4709,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, buf += bytes; addr += bytes; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return buf - old_buf; } @@ -4773,7 +4766,7 @@ void print_vma_addr(char *prefix, unsigned long ip) /* * we might be running from an atomic context so we cannot sleep */ - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, ip); @@ -4792,7 +4785,7 @@ void print_vma_addr(char *prefix, unsigned long ip) free_page((unsigned long)buf); } } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) @@ -4800,7 +4793,7 @@ void __might_fault(const char *file, int line) { /* * Some code (nfs/sunrpc) uses socket ops on kernel memory while - * holding the mmap_sem, this is safe because kernel memory doesn't + * holding the mmap_lock, this is safe because kernel memory doesn't * get paged out, therefore we'll never actually fault, and the * below annotations will generate false positives. */ @@ -4811,7 +4804,7 @@ void __might_fault(const char *file, int line) __might_sleep(file, line, 0); #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) - might_lock_read(¤t->mm->mmap_sem); + might_lock_read(¤t->mm->mmap_lock); #endif } EXPORT_SYMBOL(__might_fault); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc0aad0bc1f5..9b34e03e730a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -98,11 +98,14 @@ void mem_hotplug_done(void) u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - char *resource_name = "System RAM"; + + if (strcmp(resource_name, "System RAM")) + flags |= IORESOURCE_MEM_DRIVER_MANAGED; /* * Make sure value parsed from 'mem=' only restricts memory adding @@ -862,10 +865,9 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_new_pgdat(int nid) { struct pglist_data *pgdat; - unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -879,13 +881,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } else { int cpu; /* - * Reset the nr_zones, order and classzone_idx before reuse. - * Note that kswapd will init kswapd_classzone_idx properly + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_highest_zoneidx = 0; for_each_online_cpu(cpu) { struct per_cpu_nodestat *p; @@ -895,9 +897,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) } /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -932,7 +933,6 @@ static void rollback_node_hotadd(int nid) /** * try_online_node - online a node if offlined * @nid: the node ID - * @start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. * @@ -941,7 +941,7 @@ static void rollback_node_hotadd(int nid) * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ -static int __try_online_node(int nid, u64 start, bool set_node_online) +static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; @@ -949,7 +949,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_new_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -973,7 +973,7 @@ int try_online_node(int nid) int ret; mem_hotplug_begin(); - ret = __try_online_node(nid, 0, true); + ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } @@ -1017,17 +1017,17 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret) return ret; + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + mem_hotplug_begin(); - /* - * Add new range to memblock so that when hotadd_new_pgdat() is called - * to allocate new pgdat, get_pfn_range_for_nid() will be able to find - * this new range and calculate total pages correctly. The range will - * be removed at hot-remove time. - */ - memblock_add_node(start, size, nid); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_add_node(start, size, nid); - ret = __try_online_node(nid, start, false); + ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; @@ -1060,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res) BUG_ON(ret); /* create new memmap entry */ - firmware_map_add_hotplug(start, start + size, "System RAM"); + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); @@ -1074,7 +1075,8 @@ error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); - memblock_remove(start, size); + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); mem_hotplug_done(); return ret; } @@ -1085,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - res = register_memory_resource(start, size); + res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); @@ -1107,82 +1109,57 @@ int add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(add_memory); -#ifdef CONFIG_MEMORY_HOTREMOVE /* - * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy - * set and the size of the free page is given by page_order(). Using this, - * the function determines if the pageblock contains only free pages. - * Due to buddy contraints, a free page at least the size of a pageblock will - * be located at the start of the pageblock + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. + * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". */ -static inline int pageblock_free(struct page *page) +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name) { - return PageBuddy(page) && page_order(page) >= pageblock_order; -} + struct resource *res; + int rc; -/* Return the pfn of the start of the next active pageblock after a given pfn */ -static unsigned long next_active_pageblock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; - /* Ensure the starting page is pageblock-aligned */ - BUG_ON(pfn & (pageblock_nr_pages - 1)); + lock_device_hotplug(); - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) { - int order; - /* be careful. we don't have locks, page_order can be changed.*/ - order = page_order(page); - if ((order < MAX_ORDER) && (order >= pageblock_order)) - return pfn + (1 << order); + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; } - return pfn + pageblock_nr_pages; -} - -static bool is_pageblock_removable_nolock(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - struct zone *zone; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, - MEMORY_OFFLINE); -} - -/* Checks if this range of memory is likely to be hot-removable. */ -bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long end_pfn, pfn; - - end_pfn = min(start_pfn + nr_pages, - zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); - - /* Check the starting page of each pageblock within the range */ - for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { - if (!is_pageblock_removable_nolock(pfn)) - return false; - cond_resched(); - } + rc = add_memory_resource(nid, res); + if (rc < 0) + release_memory_resource(res); - /* All pageblocks in the memory block are likely to be hot-removable */ - return true; +out_unlock: + unlock_device_hotplug(); + return rc; } +EXPORT_SYMBOL_GPL(add_memory_driver_managed); +#ifdef CONFIG_MEMORY_HOTREMOVE /* * Confirm all pages in a range [start, end) belong to the same zone (skipping * memory holes). When true, return the zone. @@ -1224,11 +1201,17 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn, /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, - * non-lru movable pages and hugepages). We scan pfn because it's much - * easier than scanning over linked list. This function returns the pfn - * of the first found movable page if it's found, otherwise 0. + * non-lru movable pages and hugepages). Will skip over most unmovable + * pages (esp., pages that can be skipped when offlining), but bail out on + * definitely unmovable pages. + * + * Returns: + * 0 in case a movable page is found and movable_pfn was updated. + * -ENOENT in case no movable page was found. + * -EBUSY in case a definitely unmovable page was found. */ -static unsigned long scan_movable_pages(unsigned long start, unsigned long end) +static int scan_movable_pages(unsigned long start, unsigned long end, + unsigned long *movable_pfn) { unsigned long pfn; @@ -1240,18 +1223,30 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) continue; page = pfn_to_page(pfn); if (PageLRU(page)) - return pfn; + goto found; if (__PageMovable(page)) - return pfn; + goto found; + + /* + * PageOffline() pages that are not marked __PageMovable() and + * have a reference count > 0 (after MEM_GOING_OFFLINE) are + * definitely unmovable. If their reference count would be 0, + * they could at least be skipped when offlining memory. + */ + if (PageOffline(page) && page_count(page)) + return -EBUSY; if (!PageHuge(page)) continue; head = compound_head(page); if (page_huge_active(head)) - return pfn; + goto found; skip = compound_nr(head) - (page - head); pfn += skip - 1; } + return -ENOENT; +found: + *movable_pfn = pfn; return 0; } @@ -1360,7 +1355,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, } /* - * Check all pages in range, recoreded as memory resource, are isolated. + * Check all pages in range, recorded as memory resource, are isolated. */ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, @@ -1372,11 +1367,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; -#else - pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); @@ -1518,7 +1509,8 @@ static int __ref __offline_pages(unsigned long start_pfn, } do { - for (pfn = start_pfn; pfn;) { + pfn = start_pfn; + do { if (signal_pending(current)) { ret = -EINTR; reason = "signal backoff"; @@ -1528,14 +1520,19 @@ static int __ref __offline_pages(unsigned long start_pfn, cond_resched(); lru_add_drain_all(); - pfn = scan_movable_pages(pfn, end_pfn); - if (pfn) { + ret = scan_movable_pages(pfn, end_pfn, &pfn); + if (!ret) { /* * TODO: fatal migration failures should bail * out */ do_migrate_range(pfn, end_pfn); } + } while (!ret); + + if (ret != -ENOENT) { + reason = "unmovable page"; + goto failed_removal_isolated; } /* @@ -1750,8 +1747,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); arch_remove_memory(nid, start, size, NULL); - memblock_free(start, size); - memblock_remove(start, size); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_free(start, size); + memblock_remove(start, size); + } + __release_memory_resource(start, size); try_offline_node(nid); @@ -1797,4 +1798,41 @@ int remove_memory(int nid, u64 start, u64 size) return rc; } EXPORT_SYMBOL_GPL(remove_memory); + +/* + * Try to offline and remove a memory block. Might take a long time to + * finish in case memory is still in use. Primarily useful for memory devices + * that logically unplugged all memory (so it's no longer in use) and want to + * offline + remove the memory block. + */ +int offline_and_remove_memory(int nid, u64 start, u64 size) +{ + struct memory_block *mem; + int rc = -EINVAL; + + if (!IS_ALIGNED(start, memory_block_size_bytes()) || + size != memory_block_size_bytes()) + return rc; + + lock_device_hotplug(); + mem = find_memory_block(__pfn_to_section(PFN_DOWN(start))); + if (mem) + rc = device_offline(&mem->dev); + /* Ignore if the device is already offline. */ + if (rc > 0) + rc = 0; + + /* + * In case we succeeded to offline the memory block, remove it. + * This cannot fail as it cannot get onlined in the meantime. + */ + if (!rc) { + rc = try_remove_memory(nid, start, size); + WARN_ON_ONCE(rc); + } + unlock_device_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(offline_and_remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 48ba9729062e..381320671677 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -224,7 +224,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * handle an empty nodemask with MPOL_PREFERRED here. * * Must be called holding task's alloc_lock to protect task's mems_allowed - * and mempolicy. May also be called holding the mmap_semaphore for write. + * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) @@ -368,7 +368,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * Per-vma policies are protected by mmap_sem. Allocations using per-task + * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ @@ -398,17 +398,17 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) /* * Rebind each vma in mm to new nodemask. * - * Call holding a reference to mm. Takes mm->mmap_sem during call. + * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) mpol_rebind_policy(vma->vm_policy, new); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { @@ -764,7 +764,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, /* * Apply policy to a single VMA - * This must be called with the mmap_sem held for writing. + * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) @@ -789,7 +789,7 @@ static int vma_replace_policy(struct vm_area_struct *vma, } old = vma->vm_policy; - vma->vm_policy = new; /* protected by mmap_sem */ + vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; @@ -927,15 +927,12 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err == 0) { - /* E.g. GUP interrupted by fatal signal */ - err = -EFAULT; - } else if (err > 0) { + if (err > 0) { err = page_to_nid(p); put_page(p); } if (locked) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -968,10 +965,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma_intersection(mm, addr, addr+1); if (!vma) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return -EFAULT; } if (vma->vm_ops && vma->vm_ops->get_policy) @@ -988,7 +985,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, lookup_node() - * wil drop the mmap_sem, so after calling + * wil drop the mmap_lock, so after calling * lookup_node() only "pol" remains valid, "vma" * is stale. */ @@ -1030,7 +1027,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, out: mpol_cond_put(pol); if (vma) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; @@ -1139,7 +1136,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err) return err; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' @@ -1220,7 +1217,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (err < 0) return err; return busy; @@ -1343,12 +1340,12 @@ static long do_mbind(unsigned long start, unsigned long len, { NODEMASK_SCRATCH(scratch); if (scratch) { - down_write(&mm->mmap_sem); + mmap_write_lock(mm); task_lock(current); err = mpol_set_nodemask(new, nmask, scratch); task_unlock(current); if (err) - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); @@ -1385,7 +1382,7 @@ up_out: putback_movable_pages(&pagelist); } - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); mpol_out: mpol_put(new); return err; @@ -2188,7 +2185,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * When VMA is not NULL caller must read-lock the mmap_lock of the * mm_struct of the VMA to prevent it from going away. Should be used for * all allocations for pages that will be mapped into user space. Returns * NULL when no page can be allocated. diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..f37729673558 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping, * are mapped to swap space. */ if (newzone != oldzone) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES); - __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES); + struct lruvec *old_lruvec, *new_lruvec; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); + new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); + + __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); + __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_node_state(oldzone->zone_pgdat, NR_SHMEM); - __inc_node_state(newzone->zone_pgdat, NR_SHMEM); + __dec_lruvec_state(old_lruvec, NR_SHMEM); + __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_cap_account_dirty(mapping)) { __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); @@ -797,11 +804,7 @@ recheck_buffers: if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); + attach_page_private(newpage, detach_page_private(page)); bh = head; do { @@ -810,8 +813,6 @@ recheck_buffers: } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else @@ -1032,7 +1033,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, @@ -1554,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, unsigned int follflags; int err; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); err = -EFAULT; vma = find_vma(mm, addr); if (!vma || addr < vma->vm_start || !vma_migratable(vma)) @@ -1607,7 +1608,7 @@ out_putpage: */ put_page(page); out: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1732,7 +1733,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, { unsigned long i; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); @@ -1759,7 +1760,7 @@ set_status: status++; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } /* @@ -2119,7 +2120,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * pmd before doing set_pmd_at(), nor to flush the TLB after * set_pmd_at(). Clearing the pmd here would introduce a race * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the - * mmap_sem for reading. If the pmd is set to NULL at any given time, + * mmap_lock for reading. If the pmd is set to NULL at any given time, * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this * pmd. */ @@ -2674,7 +2675,7 @@ restore: * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. * * It is safe to update device page table after migrate_vma_pages() because - * both destination and source page are still locked, and the mmap_sem is held + * both destination and source page are still locked, and the mmap_lock is held * in read mode (hence no one can unmap the range being migrated). * * Once the caller is done cleaning up things and updating its page table (if it @@ -2739,7 +2740,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; - struct mem_cgroup *memcg; bool flush = false; spinlock_t *ptl; pte_t entry; @@ -2772,10 +2772,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, * pte_offset_map() on pmds where a huge pmd might be created * from a different thread. * - * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when * parallel threads are excluded by other means. * - * Here we only have down_read(mmap_sem). + * Here we only have mmap_read_lock(mm). */ if (pte_alloc(mm, pmdp)) goto abort; @@ -2786,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (unlikely(anon_vma_prepare(vma))) goto abort; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto abort; /* @@ -2832,7 +2832,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); if (!is_zone_device_page(page)) lru_cache_add_active_or_unevictable(page, vma); get_page(page); @@ -2854,7 +2853,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, unlock_abort: pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/mincore.c b/mm/mincore.c index 0e6dd9948f1a..453ff112470f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -17,9 +17,9 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/hugetlb.h> +#include <linux/pgtable.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -284,9 +284,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * Do at most PAGE_SIZE entries per iteration, due to * the temporary buffer size. */ - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (retval <= 0) break; diff --git a/mm/mlock.c b/mm/mlock.c index a72c1eeded77..f8736136fad7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(can_do_mlock); * When lazy mlocking via vmscan, it is important to ensure that the * vma's VM_LOCKED status is not concurrently being modified, otherwise we * may have mlocked a page that is being munlocked. So lazy mlock must take - * the mmap_sem for read, and verify that the vma really is locked + * the mmap_lock for read, and verify that the vma really is locked * (see mm/rmap.c). */ @@ -381,7 +381,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, /* * Initialize pte walk starting at the already pinned page where we * are sure that there is a pte, as it was pinned under the same - * mmap_sem write op. + * mmap_lock write op. */ pte = get_locked_pte(vma->vm_mm, start, &ptl); /* Make sure we do not cross the page table boundary */ @@ -565,7 +565,7 @@ success: mm->locked_vm += nr_pages; /* - * vm_flags is protected by the mmap_sem held in write mode. + * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ @@ -686,7 +686,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; @@ -705,7 +705,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); if (error) return error; @@ -742,10 +742,10 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return ret; } @@ -811,14 +811,14 @@ SYSCALL_DEFINE1(mlockall, int, flags) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); @@ -829,10 +829,10 @@ SYSCALL_DEFINE0(munlockall) { int ret; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return ret; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 7da6991d9435..435e5f794b3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), diff --git a/mm/mmap.c b/mm/mmap.c index f609e9ec4a25..59a4682ebf3f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -132,7 +132,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } - /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; @@ -238,14 +238,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_sem to read. + * __do_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; /* - * mm->brk must to be protected by write mmap_sem so update it - * before downgrading mmap_sem. When __do_munmap() fails, + * mm->brk must to be protected by write mmap_lock so update it + * before downgrading mmap_lock. When __do_munmap() fails, * mm->brk will be restored from origbrk. */ mm->brk = brk; @@ -272,9 +272,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) success: populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; if (downgraded) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); else - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); @@ -282,7 +282,7 @@ success: out: retval = origbrk; - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return retval; } @@ -505,7 +505,7 @@ static __always_inline void vma_rb_erase(struct vm_area_struct *vma, * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * - * The entire update must be protected by exclusive mmap_sem and by + * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static inline void @@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* - * Rough compatbility check to quickly see if it's even worth looking + * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ @@ -1361,7 +1361,7 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode, } /* - * The caller must hold down_write(¤t->mm->mmap_sem). + * The caller must write-lock current->mm->mmap_lock. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, @@ -2371,7 +2371,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2389,7 +2389,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions. * anon_vma_lock_write() doesn't help here, as @@ -2451,7 +2451,7 @@ int expand_downwards(struct vm_area_struct *vma, /* * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the + * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ anon_vma_lock_write(vma->anon_vma); @@ -2469,7 +2469,7 @@ int expand_downwards(struct vm_area_struct *vma, if (!error) { /* * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_sem + * updates, but we only hold a shared mmap_lock * lock here, so we need to protect against * concurrent vma expansions. * anon_vma_lock_write() doesn't help here, as @@ -2828,7 +2828,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, detach_vmas_to_be_unmapped(mm, vma, prev, end); if (downgrade) - downgrade_write(&mm->mmap_sem); + mmap_write_downgrade(mm); unmap_region(mm, vma, prev, start, end); @@ -2850,20 +2850,20 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) struct mm_struct *mm = current->mm; LIST_HEAD(uf); - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = __do_munmap(mm, start, len, &uf, downgrade); /* - * Returning 1 indicates mmap_sem is downgraded. + * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset * it to 0 before return. */ if (ret == 1) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); ret = 0; } else - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; @@ -2911,7 +2911,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; vma = find_vma(mm, start); @@ -2974,7 +2974,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, prot, flags, pgoff, &populate, NULL); fput(file); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) @@ -3074,12 +3074,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (!len) return 0; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); @@ -3107,12 +3107,12 @@ void exit_mmap(struct mm_struct *mm) /* * Manually reap the mm to free as much memory as possible. * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_sem for + * this mm from further consideration. Taking mm->mmap_lock for * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_sem is + * reaper will not run on this mm again after mmap_lock is * dropped. * - * Nothing can be holding mm->mmap_sem here and the above call + * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * @@ -3123,8 +3123,8 @@ void exit_mmap(struct mm_struct *mm) (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } if (mm->locked_vm) { @@ -3437,7 +3437,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, } /* - * Called with mm->mmap_sem held for writing. + * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. @@ -3474,7 +3474,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares @@ -3504,7 +3504,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } @@ -3513,11 +3513,11 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * - * The caller must take the mmap_sem in write mode before calling + * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the - * mmap_sem until mm_drop_all_locks() returns. + * mmap_lock until mm_drop_all_locks() returns. * - * mmap_sem in write mode is required in order to block all operations + * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. @@ -3550,7 +3550,7 @@ int mm_take_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); mutex_lock(&mm_all_locks_mutex); @@ -3622,7 +3622,7 @@ static void vm_unlock_mapping(struct address_space *mapping) } /* - * The mmap_sem cannot be released by the caller until + * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) @@ -3630,7 +3630,7 @@ void mm_drop_all_locks(struct mm_struct *mm) struct vm_area_struct *vma; struct anon_vma_chain *avc; - BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(mmap_read_trylock(mm)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { diff --git a/mm/mmu_context.c b/mm/mmu_context.c deleted file mode 100644 index 3e612ae748e9..000000000000 --- a/mm/mmu_context.c +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (C) 2009 Red Hat, Inc. - * - * See ../COPYING for licensing terms. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/sched/mm.h> -#include <linux/sched/task.h> -#include <linux/mmu_context.h> -#include <linux/export.h> - -#include <asm/mmu_context.h> - -/* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -void use_mm(struct mm_struct *mm) -{ - struct mm_struct *active_mm; - struct task_struct *tsk = current; - - task_lock(tsk); - active_mm = tsk->active_mm; - if (active_mm != mm) { - mmgrab(mm); - tsk->active_mm = mm; - } - tsk->mm = mm; - switch_mm(active_mm, mm, tsk); - task_unlock(tsk); -#ifdef finish_arch_post_lock_switch - finish_arch_post_lock_switch(); -#endif - - if (active_mm != mm) - mmdrop(active_mm); -} -EXPORT_SYMBOL_GPL(use_mm); - -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -void unuse_mm(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - sync_mm_rss(mm); - tsk->mm = NULL; - /* active_mm is still 'mm' */ - enter_lazy_tlb(mm, tsk); - task_unlock(tsk); -} -EXPORT_SYMBOL_GPL(unuse_mm); diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a3538cb2bcbe..03c33c93a582 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -301,7 +301,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb, { /* * If there are parallel threads are doing PTE changes on same range - * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB + * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs * and result in having stale TLB entries. So flush TLB forcefully * if we detect parallel PTE batching threads. diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 06852b896fa6..352bb9f3ecc0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -599,7 +599,7 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } /* - * Same as mmu_notifier_register but here the caller must hold the mmap_sem in + * Same as mmu_notifier_register but here the caller must hold the mmap_lock in * write mode. A NULL mn signals the notifier is being registered for itree * mode. */ @@ -609,7 +609,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, struct mmu_notifier_subscriptions *subscriptions = NULL; int ret; - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); if (IS_ENABLED(CONFIG_LOCKDEP)) { @@ -623,7 +623,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, /* * kmalloc cannot be called under mm_take_all_locks(), but we * know that mm->notifier_subscriptions can't change while we - * hold the write side of the mmap_sem. + * hold the write side of the mmap_lock. */ subscriptions = kzalloc( sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL); @@ -655,7 +655,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, * readers. acquire can only be used while holding the mmgrab or * mmget, and is safe because once created the * mmu_notifier_subscriptions is not freed until the mm is destroyed. - * As above, users holding the mmap_sem or one of the + * As above, users holding the mmap_lock or one of the * mm_take_all_locks() do not need to use acquire semantics. */ if (subscriptions) @@ -689,7 +689,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register); * @mn: The notifier to attach * @mm: The mm to attach the notifier to * - * Must not hold mmap_sem nor any other VM related lock when calling + * Must not hold mmap_lock nor any other VM related lock when calling * this registration function. Must also ensure mm_users can't go down * to zero while this runs to avoid races with mmu_notifier_release, * so mm has to be current->mm or the mm should be pinned safely such @@ -708,9 +708,9 @@ int mmu_notifier_register(struct mmu_notifier *subscription, { int ret; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); ret = __mmu_notifier_register(subscription, mm); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(mmu_notifier_register); @@ -750,7 +750,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) * are the same. * * Each call to mmu_notifier_get() must be paired with a call to - * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem. + * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock. * * While the caller has a mmu_notifier get the mm pointer will remain valid, * and can be converted to an active mm pointer via mmget_not_zero(). @@ -761,7 +761,7 @@ struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mmu_notifier *subscription; int ret; - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); if (mm->notifier_subscriptions) { subscription = find_get_mmu_notifier(mm, ops); @@ -983,7 +983,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mmu_notifier_subscriptions *subscriptions; int ret; - might_lock(&mm->mmap_sem); + might_lock(&mm->mmap_lock); subscriptions = smp_load_acquire(&mm->notifier_subscriptions); if (!subscriptions || !subscriptions->has_itree) { @@ -1006,7 +1006,7 @@ int mmu_interval_notifier_insert_locked( mm->notifier_subscriptions; int ret; - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); if (!subscriptions || !subscriptions->has_itree) { ret = __mmu_notifier_register(NULL, mm); diff --git a/mm/mprotect.c b/mm/mprotect.c index 494192ca954b..ce8b8a5eacbb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -28,7 +28,7 @@ #include <linux/ksm.h> #include <linux/uaccess.h> #include <linux/mm_inline.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -49,7 +49,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* - * Can be called with only the mmap_sem for reading by + * Can be called with only the mmap_lock for reading by * prot_numa so we must check the pmd isn't constantly * changing from under us from pmd_none to pmd_trans_huge * and/or the other way around. @@ -59,7 +59,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* * The pmd points to a regular pte so the pmd can't change - * from under us even if the mmap_sem is only hold for + * from under us even if the mmap_lock is only hold for * reading. */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -228,7 +228,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, next = pmd_addr_end(addr, end); /* - * Automatic NUMA balancing walks the tables with mmap_sem + * Automatic NUMA balancing walks the tables with mmap_lock * held for read. It's possible a parallel update to occur * between pmd_trans_huge() and a pmd_none_or_clear_bad() * check leading to a false positive and clearing. @@ -477,7 +477,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, success: /* - * vm_flags and vm_page_prot are protected by the mmap_sem + * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ vma->vm_flags = newflags; @@ -538,7 +538,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, reqprot = prot; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; /* @@ -628,7 +628,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, prot = reqprot; } out: - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return error; } @@ -658,7 +658,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) if (init_val & ~PKEY_ACCESS_MASK) return -EINVAL; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); pkey = mm_pkey_alloc(current->mm); ret = -ENOSPC; @@ -672,7 +672,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) } ret = pkey; out: - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return ret; } @@ -680,9 +680,9 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) { int ret; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); ret = mm_pkey_free(current->mm, pkey); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); /* * We could provie warnings or errors if any VMA still diff --git a/mm/mremap.c b/mm/mremap.c index a7e282ead438..5dd572d57ca9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -146,7 +146,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, /* * We don't have to worry about the ordering of src and dst - * pte locks because exclusive mmap_sem prevents deadlock. + * pte locks because exclusive mmap_lock prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); new_pte = pte_offset_map(new_pmd, new_addr); @@ -213,7 +213,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * We don't have to worry about the ordering of src and dst - * ptlocks because exclusive mmap_sem prevents deadlock. + * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pmd_lock(vma->vm_mm, old_pmd); new_ptl = pmd_lockptr(mm, new_pmd); @@ -266,7 +266,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ @@ -413,9 +413,20 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Always put back VM_ACCOUNT since we won't unmap */ vma->vm_flags |= VM_ACCOUNT; - vm_acct_memory(vma_pages(new_vma)); + vm_acct_memory(new_len >> PAGE_SHIFT); } + /* + * VMAs can actually be merged back together in copy_vma + * calling merge_vma. This can happen with anonymous vmas + * which have not yet been faulted, so if we were to consider + * this VMA split we'll end up adding VM_ACCOUNT on the + * next VMA, which is completely unrelated if this VMA + * was re-merged. + */ + if (split && new_vma == vma) + split = 0; + /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; @@ -685,7 +696,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (!new_len) return ret; - if (down_write_killable(¤t->mm->mmap_sem)) + if (mmap_write_lock_killable(current->mm)) return -EINTR; if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { @@ -699,7 +710,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. * __do_munmap does all the needed commit accounting, and - * downgrades mmap_sem to read if so directed. + * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; @@ -709,7 +720,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (retval < 0 && old_len != new_len) { ret = retval; goto out; - /* Returning 1 indicates mmap_sem is downgraded to read. */ + /* Returning 1 indicates mmap_lock is downgraded to read. */ } else if (retval == 1) downgraded = true; ret = addr; @@ -774,16 +785,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, out: if (offset_in_page(ret)) { vm_unacct_memory(charged); - locked = 0; + locked = false; } if (downgraded) - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); else - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); userfaultfd_unmap_complete(mm, &uf_unmap_early); - mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + mremap_userfaultfd_complete(&uf, addr, ret, old_len); userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/msync.c b/mm/msync.c index c3bd3e75f687..69c6d2029531 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -57,7 +57,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, start); for (;;) { struct file *file; @@ -88,12 +88,12 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, start); } else { if (start >= end) { @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) } } out_unlock: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out: return error ? : unmapped_error; } diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..cdcad5d61dd1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -150,24 +150,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); +} + +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) +{ + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) vma->vm_flags |= VM_USERMAP; - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); } return ret; @@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const void *addr) { return virt_to_page(addr); @@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); @@ -351,7 +353,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; @@ -369,18 +371,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); @@ -443,7 +433,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ - flush_icache_range(mm->brk, brk); + flush_icache_user_range(mm->brk, brk); return mm->brk = brk; } @@ -592,7 +582,7 @@ static void put_nommu_region(struct vm_region *region) * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous * page - * - should be called with mm->mmap_sem held writelocked + * - should be called with mm->mmap_lock held writelocked */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -706,7 +696,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) /* * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked + * - should be called with mm->mmap_lock at least held readlocked */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { @@ -752,7 +742,7 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) /* * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * - should be called with mm->mmap_lock at least held readlocked */ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long addr, @@ -1287,7 +1277,7 @@ share: /* we flush the region from the icache only when the first executable * mapping of it is made */ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { - flush_icache_range(region->vm_start, region->vm_end); + flush_icache_user_range(region->vm_start, region->vm_end); region->vm_icache_flushed = true; } @@ -1552,9 +1542,9 @@ int vm_munmap(unsigned long addr, size_t len) struct mm_struct *mm = current->mm; int ret; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); ret = do_munmap(mm, addr, len, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_munmap); @@ -1641,9 +1631,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, { unsigned long ret; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return ret; } @@ -1715,7 +1705,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; - if (down_read_killable(&mm->mmap_sem)) + if (mmap_read_lock_killable(mm)) return 0; /* the access must start within one of the target process's mappings */ @@ -1738,7 +1728,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, len = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return len; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfc357614e56..6e94962893ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -126,7 +126,7 @@ static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) /* * The process p may have detached its own ->mm while exiting or through - * use_mm(), but one or more of its subthreads may still have a valid + * kthread_use_mm(), but one or more of its subthreads may still have a valid * pointer. Return p, or any of its subthreads with a valid ->mm, with * task_lock() held. */ @@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; @@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) /* Check this allocation failure is caused by cpuset's wall function */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, - high_zoneidx, oc->nodemask) + highest_zoneidx, oc->nodemask) if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; @@ -569,7 +569,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { bool ret = true; - if (!down_read_trylock(&mm->mmap_sem)) { + if (!mmap_read_trylock(mm)) { trace_skip_task_reaping(tsk->pid); return false; } @@ -577,8 +577,8 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) /* * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't * work on the mm anymore. The check for MMF_OOM_SKIP must run - * under mmap_sem for reading because it serializes against the - * down_write();up_write() cycle in exit_mmap(). + * under mmap_lock for reading because it serializes against the + * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { trace_skip_task_reaping(tsk->pid); @@ -600,7 +600,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) out_finish: trace_finish_task_reaping(tsk->pid); out_unlock: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret; } @@ -611,7 +611,7 @@ static void oom_reap_task(struct task_struct *tsk) int attempts = 0; struct mm_struct *mm = tsk->signal->oom_mm; - /* Retry the down_read_trylock(mmap_sem) a few times */ + /* Retry the mmap_read_trylock(mm) a few times */ while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); @@ -629,7 +629,7 @@ done: /* * Hide this mm from OOM killer because it has been either reaped or - * somebody can't call up_write(mmap_sem). + * somebody can't call mmap_write_unlock(mm). */ set_bit(MMF_OOM_SKIP, &mm->flags); @@ -898,7 +898,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) /* * Kill all user processes sharing victim->mm in other thread groups, if * any. They don't get access to memory reserves, though, to avoid - * depletion of all memory. This prevents mm->mmap_sem livelock when an + * depletion of all memory. This prevents mm->mmap_lock livelock when an * oom killed thread cannot exit because it requires the semaphore and * its contended by another thread trying to allocate memory itself. * That thread will now get access to memory reserves since it has a @@ -919,8 +919,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; } /* - * No use_mm() user needs to read from the userspace so we are - * ok to reap it. + * No kthead_use_mm() user needs to read from the userspace so + * we are ok to reap it. */ if (unlikely(p->flags & PF_KTHREAD)) continue; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..28b3e7a67565 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, * requiring writeback. * * This number of dirtyable pages is the base value of which the - * user-configurable dirty ratio is the effictive number of pages that + * user-configurable dirty ratio is the effective number of pages that * are allowed to be actually dirtied. Per individual zone, or * globally by using the sum of dirtyable pages over all zones. * @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. */ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_task(tsk)) dirty += dirty / 4; return dirty; @@ -505,15 +504,13 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +521,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +531,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +546,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -759,7 +753,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1567,7 +1561,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1587,14 +1581,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1653,8 +1640,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; @@ -1673,9 +1664,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(gdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1689,9 +1691,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(mdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. + */ + goto free_running; + } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); @@ -1938,8 +1951,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) @@ -1972,7 +1984,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; @@ -2164,7 +2176,6 @@ int write_cache_pages(struct address_space *mapping, int error; struct pagevec pvec; int nr_pages; - pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -2173,8 +2184,7 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec); if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; + index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..48eb0f1410d4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,6 +68,7 @@ #include <linux/lockdep.h> #include <linux/nmi.h> #include <linux/psi.h> +#include <linux/padata.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -compound_page_dtor * const compound_page_dtors[] = { - NULL, - free_compound_page, +compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + [NULL_COMPOUND_DTOR] = NULL, + [COMPOUND_PAGE_DTOR] = free_compound_page, #ifdef CONFIG_HUGETLB_PAGE - free_huge_page, + [HUGETLB_PAGE_DTOR] = free_huge_page, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - free_transhuge_page, + [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif }; @@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; static unsigned long dma_reserve __initdata; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; static unsigned long required_kernelcore __initdata; @@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, const char *reason, - unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason) { static unsigned long resume; static unsigned long nr_shown; @@ -639,10 +637,6 @@ static void bad_page(struct page *page, const char *reason, pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); __dump_page(page, reason); - bad_flags &= page->flags; - if (bad_flags) - pr_alert("bad because of flags: %#lx(%pGp)\n", - bad_flags, &bad_flags); dump_page_owner(page); print_modules(); @@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page, return true; } -static void free_pages_check_bad(struct page *page) +static const char *page_bad_reason(struct page *page, unsigned long flags) { - const char *bad_reason; - unsigned long bad_flags; - - bad_reason = NULL; - bad_flags = 0; + const char *bad_reason = NULL; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; - bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - bad_page(page, bad_reason, bad_flags); + return bad_reason; } -static inline int free_pages_check(struct page *page) +static void check_free_page_bad(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); +} + +static inline int check_free_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; /* Something has gone sideways, find it */ - free_pages_check_bad(page); + check_free_page_bad(page); return 1; } @@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 1: /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); + bad_page(page, "nonzero compound_mapcount"); goto out; } break; @@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) break; default: if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); + bad_page(page, "corrupted mapping in tail page"); goto out; } break; } if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); + bad_page(page, "PageTail not set"); goto out; } if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); + bad_page(page, "compound_head not consistent"); goto out; } ret = 0; @@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { + if (unlikely(check_free_page(page + i))) { bad++; continue; } @@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageKmemcg(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += free_pages_check(page); + bad += check_free_page(page); if (bad) return false; @@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_check(page); + return check_free_page(page); else return false; } @@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page) static bool bulkfree_pcp_prepare(struct page *page) { - return free_pages_check(page); + return check_free_page(page); } #endif /* CONFIG_DEBUG_VM */ @@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages(page, order); } -#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ - defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_NEED_MULTIPLE_NODES static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; -int __meminit early_pfn_to_nid(unsigned long pfn) +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + */ +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { - static DEFINE_SPINLOCK(early_pfn_lock); + unsigned long start_pfn, end_pfn; int nid; - spin_lock(&early_pfn_lock); - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid < 0) - nid = first_online_node; - spin_unlock(&early_pfn_lock); + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } return nid; } -#endif +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0 && nid != node) - return false; - return true; -} + if (nid < 0) + nid = first_online_node; + spin_unlock(&early_pfn_lock); -#else -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return true; + return nid; } -#endif - +#endif /* CONFIG_NEED_MULTIPLE_NODES */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -1607,6 +1609,7 @@ void set_zone_contiguous(struct zone *zone) if (!__pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone)) return; + cond_resched(); } /* We confirm that there is no hole */ @@ -1691,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - touch_nmi_watchdog(); } else { nr_free++; } @@ -1721,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - touch_nmi_watchdog(); } else { page++; } @@ -1815,16 +1816,43 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, return nr_pages; } +static void __init +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + void *arg) +{ + unsigned long spfn, epfn; + struct zone *zone = arg; + u64 i; + + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); + + /* + * Initialize and free pages in MAX_ORDER sized increments so that we + * can avoid introducing any issues with the buddy allocator. + */ + while (spfn < end_pfn) { + deferred_init_maxorder(&i, zone, &spfn, &epfn); + cond_resched(); + } +} + +/* An arch may override for more concurrency. */ +__weak int __init +deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + return 1; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long spfn = 0, epfn = 0; unsigned long first_init_pfn, flags; unsigned long start = jiffies; struct zone *zone; - int zid; + int zid, max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1844,6 +1872,13 @@ static int __init deferred_init_memmap(void *data) BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); pgdat->first_deferred_pfn = ULONG_MAX; + /* + * Once we unlock here, the zone cannot be grown anymore, thus if an + * interrupt thread must allocate this early in boot, zone must be + * pre-grown prior to start of deferred page initialization. + */ + pgdat_resize_unlock(pgdat, &flags); + /* Only the highest zone is deferred so find it */ for (zid = 0; zid < MAX_NR_ZONES; zid++) { zone = pgdat->node_zones + zid; @@ -1856,21 +1891,30 @@ static int __init deferred_init_memmap(void *data) first_init_pfn)) goto zone_empty; - /* - * Initialize and free pages in MAX_ORDER sized increments so - * that we can avoid introducing any issues with the buddy - * allocator. - */ - while (spfn < epfn) - nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); -zone_empty: - pgdat_resize_unlock(pgdat, &flags); + max_threads = deferred_page_init_max_threads(cpumask); + while (spfn < epfn) { + unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_chunk, + .fn_arg = zone, + .start = spfn, + .size = epfn_align - spfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + }; + + padata_do_multithreaded(&job); + deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + epfn_align); + } +zone_empty: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", - pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); + pr_info("node %d deferred pages initialised in %ums\n", + pgdat->node_id, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1908,17 +1952,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order) pgdat_resize_lock(pgdat, &flags); /* - * If deferred pages have been initialized while we were waiting for - * the lock, return true, as the zone was grown. The caller will retry - * this zone. We won't return to this function since the caller also - * has this static branch. - */ - if (!static_branch_unlikely(&deferred_pages)) { - pgdat_resize_unlock(pgdat, &flags); - return true; - } - - /* * If someone grew this zone while we were waiting for spinlock, return * true, as there might be enough pages already. */ @@ -1946,6 +1979,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) first_deferred_pfn = spfn; nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + touch_nmi_watchdog(); /* We should only stop along section boundaries */ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) @@ -2091,31 +2125,14 @@ static inline void expand(struct zone *zone, struct page *page, static void check_new_page_bad(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; - - if (unlikely(atomic_read(&page->_mapcount) != -1)) - bad_reason = "nonzero mapcount"; - if (unlikely(page->mapping != NULL)) - bad_reason = "non-NULL mapping"; - if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { - bad_reason = "HWPoisoned (hardware-corrupted)"; - bad_flags = __PG_HWPOISON; /* Don't complain about hwpoisoned pages */ page_mapcount_reset(page); /* remove PageBuddy */ return; } - if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { - bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; - bad_flags = PAGE_FLAGS_CHECK_AT_PREP; - } -#ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) - bad_reason = "page still charged to cgroup"; -#endif - bad_page(page, bad_reason, bad_flags); + + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); } /* @@ -2400,6 +2417,14 @@ static inline void boost_watermark(struct zone *zone) if (!watermark_boost_factor) return; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); @@ -2600,7 +2625,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, int order; bool ret; - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, ac->nodemask) { /* * Preserve at least one pageblock unless memory pressure @@ -2759,6 +2784,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; +#ifdef CONFIG_CMA + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (migratetype == MIGRATE_MOVABLE && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + return page; + } +#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -3455,7 +3494,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); * to check in the allocation paths if no pages are free. */ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages) { long min = mark; @@ -3500,7 +3539,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * are not met, then a high-order request also cannot go ahead * even if a suitable page happened to be free. */ - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) return false; /* If this is an order-0 request then the watermark is fine */ @@ -3533,14 +3572,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags) + int highest_zoneidx, unsigned int alloc_flags) { - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } static inline bool zone_watermark_fast(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, unsigned int alloc_flags) + unsigned long mark, int highest_zoneidx, + unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); long cma_pages = 0; @@ -3558,22 +3598,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && (free_pages - cma_pages) > + mark + z->lowmem_reserve[highest_zoneidx]) return true; - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages); } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx) + unsigned long mark, int highest_zoneidx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, 0, + return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, free_pages); } @@ -3650,8 +3691,8 @@ retry: */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; - for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -3706,7 +3747,7 @@ retry: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) { + ac->highest_zoneidx, alloc_flags)) { int ret; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -3739,7 +3780,7 @@ retry: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) goto try_this_zone; continue; @@ -3898,7 +3939,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_RETRY_MAYFAIL) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) + if (ac->highest_zoneidx < ZONE_NORMAL) goto out; if (pm_suspended_storage()) goto out; @@ -4101,10 +4142,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla * Let's give them a good hope and keep retrying while the order-0 * watermarks are OK. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) + ac->highest_zoneidx, alloc_flags)) return true; } return false; @@ -4228,12 +4269,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; - enum zone_type high_zoneidx = ac->high_zoneidx; + enum zone_type highest_zoneidx = ac->highest_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -4276,7 +4317,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; #ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -4368,8 +4409,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * request even if all reclaimable pages are considered then we are * screwed and have to go OOM. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, - ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { unsigned long available; unsigned long reclaimable; unsigned long min_wmark = min_wmark_pages(zone); @@ -4383,7 +4424,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * reclaimable pages? */ wmark = __zone_watermark_ok(zone, order, min_wmark, - ac_classzone_idx(ac), alloc_flags, available); + ac->highest_zoneidx, alloc_flags, available); trace_reclaim_retry_zone(z, order, reclaimable, available, min_wmark, *no_progress_loops, wmark); if (wmark) { @@ -4502,7 +4543,7 @@ retry_cpuset: * could end up iterating over non-eligible zones endlessly. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage; @@ -4589,7 +4630,7 @@ retry: if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* Attempt with potentially adjusted zonelist and alloc_flags */ @@ -4723,10 +4764,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { - ac->high_zoneidx = gfp_zone(gfp_mask); + ac->highest_zoneidx = gfp_zone(gfp_mask); ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; - ac->migratetype = gfpflags_to_migratetype(gfp_mask); + ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { *alloc_mask |= __GFP_HARDWALL; @@ -4762,7 +4803,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) * may get reset for allocations that ignore memory policies. */ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, ac->nodemask); + ac->highest_zoneidx, ac->nodemask); } /* @@ -5310,7 +5351,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5323,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5356,7 +5396,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5378,7 +5417,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5411,6 +5449,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " managed:%lukB" " mlocked:%lukB" " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" @@ -5433,6 +5474,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), zone_page_state(zone, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + zone_page_state(zone, NR_KERNEL_SCS_KB), +#endif K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), @@ -5531,36 +5575,17 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -5680,14 +5705,13 @@ static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; int node, load, nr_nodes = 0; - nodemask_t used_mask; + nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; - nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { @@ -5899,7 +5923,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat) static bool __meminit overlap_memmap_init(unsigned long zone, unsigned long *pfn) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP static struct memblock_region *r; if (mirrored_kernelcore && zone == ZONE_MOVABLE) { @@ -5915,27 +5938,9 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return true; } } -#endif return false; } -#ifdef CONFIG_SPARSEMEM -/* Skip PFNs that belong to non-present sections */ -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - const unsigned long section_nr = pfn_to_section_nr(++pfn); - - if (present_section_nr(section_nr)) - return pfn; - return section_nr_to_pfn(next_present_section_nr(section_nr)); -} -#else -static inline __meminit unsigned long next_pfn(unsigned long pfn) -{ - return pfn++; -} -#endif - /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5975,14 +5980,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn); - continue; - } - if (!early_pfn_in_nid(pfn, nid)) { - pfn++; - continue; - } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6098,9 +6095,23 @@ static void __meminit zone_init_free_lists(struct zone *zone) } void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, unsigned long start_pfn) + unsigned long zone, + unsigned long range_start_pfn) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); + unsigned long start_pfn, end_pfn; + unsigned long range_end_pfn = range_start_pfn + size; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + + if (end_pfn > start_pfn) { + size = end_pfn - start_pfn; + memmap_init_zone(size, nid, zone, start_pfn, + MEMMAP_EARLY, NULL); + } + } } static int zone_batchsize(struct zone *zone) @@ -6252,10 +6263,25 @@ void __init setup_per_cpu_pageset(void) { struct pglist_data *pgdat; struct zone *zone; + int __maybe_unused cpu; for_each_populated_zone(zone) setup_zone_pageset(zone); +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. + */ + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); + memset(pcp->vm_numa_stat_diff, 0, + sizeof(pcp->vm_numa_stat_diff)); + } +#endif + for_each_online_pgdat(pgdat) pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); @@ -6298,57 +6324,6 @@ void __meminit init_currently_empty_zone(struct zone *zone, zone->initialized = 1; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID - -/* - * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - */ -int __meminit __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) -{ - unsigned long start_pfn, end_pfn; - int nid; - - if (state->last_start <= pfn && pfn < state->last_end) - return state->last_nid; - - nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); - if (nid != NUMA_NO_NODE) { - state->last_start = start_pfn; - state->last_end = end_pfn; - state->last_nid = nid; - } - - return nid; -} -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ - -/** - * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range - * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. - * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid - * - * If an architecture guarantees that all ranges registered contain no holes - * and may be freed, this this function may be used instead of calling - * memblock_free_early_nid() manually. - */ -void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) -{ - unsigned long start_pfn, end_pfn; - int i, this_nid; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { - start_pfn = min(start_pfn, max_low_pfn); - end_pfn = min(end_pfn, max_low_pfn); - - if (start_pfn < end_pfn) - memblock_free_early_nid(PFN_PHYS(start_pfn), - (end_pfn - start_pfn) << PAGE_SHIFT, - this_nid); - } -} - /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -6461,8 +6436,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *ignored) + unsigned long *zone_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6526,8 +6500,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, static unsigned long __init zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *ignored) + unsigned long node_end_pfn) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; @@ -6574,45 +6547,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid, return nr_absent; } -#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static inline unsigned long __init zone_spanned_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zone_start_pfn, - unsigned long *zone_end_pfn, - unsigned long *zones_size) -{ - unsigned int zone; - - *zone_start_pfn = node_start_pfn; - for (zone = 0; zone < zone_type; zone++) - *zone_start_pfn += zones_size[zone]; - - *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; - - return zones_size[zone_type]; -} - -static inline unsigned long __init zone_absent_pages_in_node(int nid, - unsigned long zone_type, - unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zholes_size) -{ - if (!zholes_size) - return 0; - - return zholes_size[zone_type]; -} - -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, - unsigned long node_end_pfn, - unsigned long *zones_size, - unsigned long *zholes_size) + unsigned long node_end_pfn) { unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; @@ -6620,17 +6557,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; unsigned long size, real_size; - size = zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - &zone_start_pfn, - &zone_end_pfn, - zones_size); - real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, - node_start_pfn, node_end_pfn, - zholes_size); + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + if (size) zone->zone_start_pfn = zone_start_pfn; else @@ -6930,10 +6871,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= offset; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif } @@ -6950,30 +6889,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __init free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, - unsigned long *zholes_size) +static void __init free_area_init_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ - WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); + WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; + pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); -#else - start_pfn = node_start_pfn; -#endif - calculate_node_totalpages(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); pgdat_set_deferred_range(pgdat); @@ -6981,6 +6915,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +void __init free_area_init_memoryless_node(int nid) +{ + free_area_init_node(nid); +} + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Initialize all valid struct pages in the range [spfn, epfn) and mark them @@ -7064,8 +7003,6 @@ static inline void __init init_unavailable_mem(void) } #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. @@ -7129,24 +7066,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/* Find the lowest pfn for a node */ -static unsigned long __init find_min_pfn_for_node(int nid) -{ - unsigned long min_pfn = ULONG_MAX; - unsigned long start_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) - min_pfn = min(min_pfn, start_pfn); - - if (min_pfn == ULONG_MAX) { - pr_warn("Could not find start_pfn for node %d\n", nid); - return 0; - } - - return min_pfn; -} - /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * @@ -7155,7 +7074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) */ unsigned long __init find_min_pfn_with_active_regions(void) { - return find_min_pfn_for_node(MAX_NUMNODES); + return PHYS_PFN(memblock_start_of_DRAM()); } /* @@ -7208,7 +7127,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (!memblock_is_hotpluggable(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? @@ -7229,7 +7148,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (memblock_is_mirror(r)) continue; - nid = r->nid; + nid = memblock_get_region_node(r); usable_startpfn = memblock_region_memory_base_pfn(r); @@ -7244,7 +7163,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) } if (mem_below_4gb_not_mirrored) - pr_warn("This configuration results in unmirrored kernel memory."); + pr_warn("This configuration results in unmirrored kernel memory.\n"); goto out2; } @@ -7409,8 +7328,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } } +/* + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + /** - * free_area_init_nodes - Initialise all pg_data_t and zone data + * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. @@ -7422,10 +7350,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ -void __init free_area_init_nodes(unsigned long *max_zone_pfn) +void __init free_area_init(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int i, nid, zone; + bool descending; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -7434,14 +7363,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_highest_possible_pfn)); start_pfn = find_min_pfn_with_active_regions(); + descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { - if (i == ZONE_MOVABLE) + if (descending) + zone = MAX_NR_ZONES - i - 1; + else + zone = i; + + if (zone == ZONE_MOVABLE) continue; - end_pfn = max(max_zone_pfn[i], start_pfn); - arch_zone_lowest_possible_pfn[i] = start_pfn; - arch_zone_highest_possible_pfn[i] = end_pfn; + end_pfn = max(max_zone_pfn[zone], start_pfn); + arch_zone_lowest_possible_pfn[zone] = start_pfn; + arch_zone_highest_possible_pfn[zone] = end_pfn; start_pfn = end_pfn; } @@ -7494,8 +7429,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + free_area_init_node(nid); /* Any memory on that node */ if (pgdat->node_present_pages) @@ -7560,8 +7494,6 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); @@ -7684,13 +7616,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -void __init free_area_init(unsigned long *zones_size) -{ - init_unavailable_mem(); - free_area_init_node(0, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); -} - static int page_alloc_cpu_dead(unsigned int cpu) { @@ -7808,9 +7733,10 @@ static void setup_per_zone_lowmem_reserve(void) idx--; lower_zone = pgdat->node_zones + idx; - if (sysctl_lowmem_reserve_ratio[idx] < 1) { - sysctl_lowmem_reserve_ratio[idx] = 0; + if (!sysctl_lowmem_reserve_ratio[idx] || + !zone_managed_pages(lower_zone)) { lower_zone->lowmem_reserve[j] = 0; + continue; } else { lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; @@ -7875,9 +7801,9 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone_managed_pages(zone), watermark_scale_factor, 10000)); + zone->watermark_boost = 0; zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; - zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7963,7 +7889,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7978,20 +7904,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } -int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int rc; - - rc = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (rc) - return rc; - - return 0; -} - int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8021,7 +7935,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8048,7 +7962,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8072,9 +7986,17 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { + int i; + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + setup_per_zone_lowmem_reserve(); return 0; } @@ -8094,7 +8016,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; @@ -8238,7 +8160,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* @@ -8363,6 +8285,19 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; + /* + * We treat all PageOffline() pages as movable when offlining + * to give drivers a chance to decrement their reference count + * in MEM_GOING_OFFLINE in order to indicate that these pages + * can be offlined as there are no direct references anymore. + * For actually unmovable PageOffline() where the driver does + * not support this, we will fail later when trying to actually + * move these pages that still have a reference count > 0. + * (false negatives in this function only) + */ + if ((flags & MEMORY_OFFLINE) && PageOffline(page)) + continue; + if (__PageMovable(page) || PageLRU(page)) continue; @@ -8402,7 +8337,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) { /* This function is based on compact_zone() from compaction.c. */ - unsigned long nr_reclaimed; + unsigned int nr_reclaimed; unsigned long pfn = start; unsigned int tries = 0; int ret = 0; @@ -8594,6 +8529,7 @@ done: pfn_max_align_up(end), migratetype); return ret; } +EXPORT_SYMBOL(alloc_contig_range); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) @@ -8709,6 +8645,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } +EXPORT_SYMBOL(free_contig_range); /* * The zone indicated has a new number of managed_pages; batch sizes and percpu @@ -8781,6 +8718,17 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) offlined_pages++; continue; } + /* + * At this point all remaining PageOffline() pages have a + * reference count of 0 and can simply be skipped. + */ + if (PageOffline(page)) { + BUG_ON(page_count(page)); + BUG_ON(PageBuddy(page)); + pfn++; + offlined_pages++; + continue; + } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); diff --git a/mm/page_idle.c b/mm/page_idle.c index 295512465065..057c61df12db 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -4,6 +4,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kobject.h> +#include <linux/memory_hotplug.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/pagemap.h> @@ -30,13 +31,9 @@ */ static struct page *page_idle_get_page(unsigned long pfn) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); pg_data_t *pgdat; - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); if (!page || !PageLRU(page) || !get_page_unless_zero(page)) return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index 76965be1d40e..e8726f3e3820 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,7 +25,6 @@ #include <linux/psi.h> #include <linux/uio.h> #include <linux/sched/task.h> -#include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 2c11a38d6e87..f6d07c5f0d34 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -151,6 +151,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * a bit mask) * MEMORY_OFFLINE - isolate to offline (!allocate) memory * e.g., skip over PageHWPoison() pages + * and PageOffline() pages. * REPORT_FAILURE - report details about the failure to * isolate the range * @@ -259,6 +260,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; + else if ((flags & MEMORY_OFFLINE) && PageOffline(page) && + !page_count(page)) + /* + * The responsible driver agreed to skip PageOffline() + * pages when offlining memory by dropping its + * reference in MEM_GOING_OFFLINE. + */ + pfn++; else break; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 18ecde9f45b2..360461509423 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_owner = get_page_owner(page_ext); - page_mt = gfpflags_to_migratetype( - page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); if (pageblock_mt != page_mt) { if (is_migrate_cma(pageblock_mt)) count[MIGRATE_MOVABLE]++; @@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); + page_mt = gfp_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page) page_owner = get_page_owner(page_ext); gfp_mask = page_owner->gfp_mask; - mt = gfpflags_to_migratetype(gfp_mask); + mt = gfp_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); diff --git a/mm/page_reporting.h b/mm/page_reporting.h index aa6d37f4dc22..2c385dd4ddbd 100644 --- a/mm/page_reporting.h +++ b/mm/page_reporting.h @@ -7,7 +7,7 @@ #include <linux/page-isolation.h> #include <linux/jump_label.h> #include <linux/slab.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <linux/scatterlist.h> #define PAGE_REPORTING_MIN_ORDER pageblock_order diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 928df1638c30..e81640d9f177 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -373,7 +373,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, * caller-specific data to callbacks, @private should be helpful. * * Locking: - * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem, + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, * because these function traverse vma list and/or access to vma's data. */ int walk_page_range(struct mm_struct *mm, unsigned long start, @@ -395,7 +395,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!walk.mm) return -EINVAL; - lockdep_assert_held(&walk.mm->mmap_sem); + mmap_assert_locked(walk.mm); vma = find_vma(walk.mm, start); do { @@ -453,7 +453,7 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; - lockdep_assert_held(&walk.mm->mmap_sem); + mmap_assert_locked(walk.mm); return __walk_page_range(start, end, &walk); } @@ -472,7 +472,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - lockdep_assert_held(&walk.mm->mmap_sem); + mmap_assert_locked(walk.mm); err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) @@ -498,11 +498,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, * Also see walk_page_range() for additional information. * * Locking: - * This function can't require that the struct mm_struct::mmap_sem is held, + * This function can't require that the struct mm_struct::mmap_lock is held, * since @mapping may be mapped by multiple processes. Instead * @mapping->i_mmap_rwsem must be held. This might have implications in the * callbacks, and it's up tho the caller to ensure that the - * struct mm_struct::mmap_sem is not needed. + * struct mm_struct::mmap_lock is not needed. * * Also this means that a caller can't rely on the struct * vm_area_struct::vm_flags to be constant across a call, diff --git a/mm/percpu.c b/mm/percpu.c index d7e3bc649f4e..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <asm/cacheflush.h> #include <asm/sections.h> @@ -481,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** @@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { - /* whitelisted flags that can be passed to the backing allocators */ - gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - bool do_warn = !(gfp & __GFP_NOWARN); + gfp_t pcpu_gfp; + bool is_atomic; + bool do_warn; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; @@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, void __percpu *ptr; size_t bits, bit_align; + gfp = current_gfp_context(gfp); + /* whitelisted flags that can be passed to the backing allocators */ + pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + do_warn = !(gfp & __GFP_NOWARN); + /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3d7c01e76efc..9578db83e312 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -2,15 +2,15 @@ /* * mm/pgtable-generic.c * - * Generic pgtable methods declared in asm-generic/pgtable.h + * Generic pgtable methods declared in linux/pgtable.h * * Copyright (C) 2010 Linus Torvalds */ #include <linux/pagemap.h> #include <linux/hugetlb.h> +#include <linux/pgtable.h> #include <asm/tlb.h> -#include <asm-generic/pgtable.h> /* * If a p?d_bad entry is found while walking page tables, report @@ -53,7 +53,7 @@ void pmd_clear_bad(pmd_t *pmd) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed), as well as write + * Only sets the access flags (dirty, accessed), as well as write * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn @@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 74e957e302fe..cc85ce81914a 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -104,12 +104,12 @@ static int process_vm_rw_single_vec(unsigned long addr, * access remotely because task/mm might not * current/current->mm */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages, flags, process_pages, NULL, &locked); if (locked) - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (pinned_pages <= 0) return -EFAULT; diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..ba88ec43ff21 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if (pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(val)); return 0; } @@ -126,13 +141,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); while (range->start != range->end) { walk_page_range_novma(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* Flush out the last page */ st->note_page(st, 0, -1, 0); diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..3c9a8dd7c56c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,6 +22,7 @@ #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> +#include <linux/sched/mm.h> #include "internal.h" @@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static int read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned int nr_pages, gfp_t gfp) +static void read_pages(struct readahead_control *rac, struct list_head *pages, + bool skip_page) { + const struct address_space_operations *aops = rac->mapping->a_ops; + struct page *page; struct blk_plug plug; - unsigned page_idx; - int ret; + + if (!readahead_count(rac)) + goto out; blk_start_plug(&plug); - if (mapping->a_ops->readpages) { - ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { + aops->readpages(rac->file, rac->mapping, pages, + readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); - goto out; - } - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) - mapping->a_ops->readpage(filp, page); - put_page(page); + rac->_index += rac->_nr_pages; + rac->_nr_pages = 0; + } else { + while ((page = readahead_page(rac))) { + aops->readpage(rac->file, page); + put_page(page); + } } - ret = 0; -out: blk_finish_plug(&plug); - return ret; + BUG_ON(!list_empty(pages)); + BUG_ON(readahead_count(rac)); + +out: + if (skip_page) + rac->_index++; } -/* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. - * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. * - * Returns the number of pages requested, or the maximum amount of I/O allowed. + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. */ -unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; - struct page *page; - unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); - int page_idx; - unsigned int nr_pages = 0; - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); + struct readahead_control rac = { + .mapping = mapping, + .file = file, + ._index = index, + }; + unsigned long i; - if (isize == 0) - goto out; - - end_index = ((isize - 1) >> PAGE_SHIFT); + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); /* * Preallocate as many pages as we will need. */ - for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = offset + page_idx; + for (i = 0; i < nr_to_read; i++) { + struct page *page = xa_load(&mapping->i_pages, index + i); - if (page_offset > end_index) - break; + BUG_ON(index + i != rac._index + rac._nr_pages); - page = xa_load(&mapping->i_pages, page_offset); if (page && !xa_is_value(page)) { /* - * Page already present? Kick off the current batch of - * contiguous pages before continuing with the next - * batch. + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, - gfp_mask); - nr_pages = 0; + read_pages(&rac, &page_pool, true); continue; } page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = page_offset; - list_add(&page->lru, &page_pool); - if (page_idx == nr_to_read - lookahead_size) + if (mapping->a_ops->readpages) { + page->index = index + i; + list_add(&page->lru, &page_pool); + } else if (add_to_page_cache_lru(page, mapping, index + i, + gfp_mask) < 0) { + put_page(page); + read_pages(&rac, &page_pool, true); + continue; + } + if (i == nr_to_read - lookahead_size) SetPageReadahead(page); - nr_pages++; + rac._nr_pages++; } /* @@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); - BUG_ON(!list_empty(&page_pool)); -out: - return nr_pages; + read_pages(&rac, &page_pool, false); + memalloc_nofs_restore(nofs); +} +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) +void force_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t index, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) - return -EINVAL; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) + return; /* * If the request exceeds the readahead window, allow the read to @@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); + __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); - offset += this_chunk; + index += this_chunk; nr_to_read -= this_chunk; } - return 0; } /* @@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* - * Count contiguously cached pages from @offset-1 to @offset-@max, + * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - pgoff_t offset, unsigned long max) + pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = page_cache_prev_miss(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); - return offset - 1 - head; + return index - 1 - head; } /* @@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping, */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, - pgoff_t offset, + pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; - size = count_history_pages(mapping, offset, max); + size = count_history_pages(mapping, index, max); /* * not enough history pages: @@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping, * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ - if (size >= offset) + if (size >= index) size *= 2; - ra->start = offset; + ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; @@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static unsigned long -ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, - unsigned long req_size) +static void ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t index, + unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - pgoff_t prev_offset; + pgoff_t prev_index; /* * If the request exceeds the readahead window, allow the read to @@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping, /* * start of file */ - if (!offset) + if (!index) goto initial_readahead; /* - * It's the expected callback offset, assume sequential access. + * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if ((offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((index == (ra->start + ra->size - ra->async_size) || + index == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, index + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max_pages) - return 0; + if (!start || start - index > max_pages) + return; ra->start = start; - ra->size = start - offset; /* old async_size */ + ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; - if (offset - prev_offset <= 1UL) + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) + if (try_context_readahead(mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. */ - return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, index, req_size, 0); + return; initial_readahead: - ra->start = offset; + ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; @@ -478,7 +537,7 @@ readit: * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ - if (offset == ra->start && ra->size == ra->async_size) { + if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; @@ -489,7 +548,7 @@ readit: } } - return ra_submit(ra, mapping, filp); + ra_submit(ra, mapping, filp); } /** @@ -497,9 +556,8 @@ readit: * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more @@ -508,7 +566,7 @@ readit: */ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - pgoff_t offset, unsigned long req_size) + pgoff_t index, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, offset, req_size); + force_page_cache_readahead(mapping, filp, index, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, offset, req_size); + ondemand_readahead(mapping, ra, filp, false, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @page: the page at @offset which has the PG_readahead flag set - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which - * has the PG_readahead flag; this is a marker to suggest that the application + * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t offset, - unsigned long req_size) + struct page *page, pgoff_t index, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, offset, req_size); + ondemand_readahead(mapping, ra, filp, true, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index f79a206b271a..5fe2dedce1fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,7 +21,7 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * mm->mmap_sem + * mm->mmap_lock * page->flags PG_locked (lock_page) * (see huegtlbfs below) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem @@ -177,7 +177,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * to do any locking for the common case of already having * an anon_vma. * - * This must be called with the mmap_sem held for reading. + * This must be called with the mmap_lock held for reading. */ int __anon_vma_prepare(struct vm_area_struct *vma) { @@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page, bool compound = flags & RMAP_COMPOUND; bool first; + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (compound) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page, * disabled. */ if (compound) - __inc_node_page_state(page, NR_ANON_THPS); - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) - return; - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) { + unlock_page_memcg(page); + return; + } /* address might be in next vma when migration races vma_adjust */ if (first) @@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_node_page_state(page, NR_ANON_THPS); + __inc_lruvec_page_state(page, NR_ANON_THPS); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); } - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound) int i, nr = 1; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); - goto out; + return; } /* page still mapped by someone else? */ @@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; if (PageSwapBacked(page)) __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; } /* @@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page); -out: - unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_node_page_state(page, NR_ANON_THPS); + __dec_lruvec_page_state(page, NR_ANON_THPS); if (TestClearPageDoubleMap(page)) { /* @@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page) clear_page_mlock(page); if (nr) - __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } /** @@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page) */ void page_remove_rmap(struct page *page, bool compound) { - if (!PageAnon(page)) - return page_remove_file_rmap(page, compound); + lock_page_memcg(page); - if (compound) - return page_remove_anon_compound_rmap(page); + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - return; + goto out; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __dec_node_page_state(page, NR_ANON_MAPPED); + __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ +out: + unlock_page_memcg(page); } /* @@ -1433,7 +1444,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!PageTransCompound(page)) { /* * Holding pte lock, we do *not* need - * mmap_sem here + * mmap_lock here */ mlock_vma_page(page); } @@ -1806,7 +1817,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, /* * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages - * are holding mmap_sem. Users without mmap_sem are required to + * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = page_anon_vma(page); @@ -1826,7 +1837,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. * - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * When called from try_to_munlock(), the mmap_lock of the mm containing the vma * where the page was found will be held for write. So, we won't recheck * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. @@ -1878,7 +1889,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * When called from try_to_munlock(), the mmap_lock of the mm containing the vma * where the page was found will be held for write. So, we won't recheck * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 5e313fa93276..2a99df7beeb3 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -25,7 +25,7 @@ void rodata_test(void) } /* test 2: write to the variable; this should fault */ - if (!probe_kernel_write((void *)&rodata_test_data, + if (!copy_to_kernel_nofault((void *)&rodata_test_data, (void *)&zero, sizeof(zero))) { pr_err("test data was not read only\n"); return; diff --git a/mm/shmem.c b/mm/shmem.c index d722eb830317..a0dbe62f8042 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -82,7 +82,6 @@ static struct vfsmount *shm_mnt; #include <linux/uuid.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include "internal.h" @@ -605,11 +604,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; unsigned long nr = compound_nr(page); + int error; VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -621,6 +622,18 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; + if (!PageSwapCache(page)) { + error = mem_cgroup_charge(page, charge_mm, gfp); + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + cgroup_throttle_swaprate(page, gfp); + do { void *entry; xas_lock_irq(&xas); @@ -641,19 +654,22 @@ next: __inc_node_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + page->mapping = NULL; + page_ref_sub(page, nr); + return error; } /* @@ -670,8 +686,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + __dec_lruvec_page_state(page, NR_FILE_PAGES); + __dec_lruvec_page_state(page, NR_SHMEM); xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); @@ -952,7 +968,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, VM_BUG_ON_PAGE(PageWriteback(page), page); if (shmem_punch_compound(page, start, end)) truncate_inode_page(mapping, page); - else { + else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { /* Wipe the page and don't get stuck */ clear_highpage(page); flush_dcache_page(page); @@ -1578,8 +1594,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, xa_lock_irq(&swap_mapping->i_pages); error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(oldpage, newpage); + __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1591,8 +1608,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); + lru_cache_add(newpage); *pagep = newpage; } @@ -1619,7 +1635,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; - struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1664,31 +1679,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1734,7 +1730,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; struct page *page; enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; @@ -1859,25 +1854,12 @@ alloc_nohuge: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); - if (error) { - if (PageTransHuge(page)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + lru_cache_add(page); spin_lock_irq(&info->lock); info->alloced += compound_nr(page); @@ -2179,7 +2161,11 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; - spin_lock_irq(&info->lock); + /* + * What serializes the accesses to info->flags? + * ipc_lock_object() when called from shmctl_do_lock(), + * no serialization needed when called from shm_destroy(). + */ if (lock && !(info->flags & VM_LOCKED)) { if (!user_shm_lock(inode->i_size, user)) goto out_nomem; @@ -2194,7 +2180,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) retval = 0; out_nomem: - spin_unlock_irq(&info->lock); return retval; } @@ -2311,7 +2296,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; spinlock_t *ptl; void *page_kaddr; struct page *page; @@ -2335,7 +2319,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, PAGE_SIZE); kunmap_atomic(page_kaddr); - /* fallback to copy_from_user outside mmap_sem */ + /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { *pagep = page; shmem_inode_unacct_blocks(inode, 1); @@ -2361,16 +2345,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); - if (ret) - goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); + goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2391,19 +2369,19 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; + goto out_release_unlock; ret = -EEXIST; if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_release_unlock; - lru_cache_add_anon(page); + lru_cache_add(page); - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->alloced++; inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); - spin_unlock(&info->lock); + spin_unlock_irq(&info->lock); inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -2416,12 +2394,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, ret = 0; out: return ret; -out_release_uncharge_unlock: +out_release_unlock: pte_unmap_unlock(dst_pte, ptl); ClearPageDirty(page); delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); out_release: unlock_page(page); put_page(page); @@ -4160,7 +4136,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) loff_t size = vma->vm_end - vma->vm_start; /* - * Cloning a new file under mmap_sem leads to a lock ordering conflict + * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). diff --git a/mm/slab.c b/mm/slab.c index a89633603b2d..9350062ffc1a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *obj = NULL; struct page *page; int nid; @@ -3124,7 +3124,7 @@ retry: * Look through allowed nodes for objects available * from existing per node queues. */ - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { nid = zone_to_nid(zone); if (cpuset_zone_allowed(zone, flags) && diff --git a/mm/slab_common.c b/mm/slab_common.c index 23c7500eea7d..9e72ba224175 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, 0); + SLAB_CACHE_DMA | flags, 0, + kmalloc_info[i].size); } } #endif diff --git a/mm/slob.c b/mm/slob.c index fa53e9f73893..ac2aecfbc7a8 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -524,6 +524,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) { return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); } +EXPORT_SYMBOL(__kmalloc_track_caller); #ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, @@ -531,6 +532,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, { return __do_kmalloc_node(size, gfp, node, caller); } +EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif void kfree(const void *block) diff --git a/mm/slub.c b/mm/slub.c index 332d4b459a90..fe81773fd97e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -292,7 +292,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) return get_freepointer(s, object); freepointer_addr = (unsigned long)object + s->offset; - probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); return freelist_ptr(s, p, freepointer_addr); } @@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} + static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { struct track *p; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + p = object + get_info_end(s); return p + alloc; } @@ -662,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) va_end(args); } +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, page, nextfree)) { + object_err(s, page, freelist, "Freechain corrupt"); + freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ @@ -686,10 +717,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + off = get_info_end(s); if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -782,7 +810,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object address * Bytes of the object to be managed. * If the freepointer may overlay the object then the free - * pointer is the first word of the object. + * pointer is at the middle of the object. * * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) @@ -816,11 +844,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { - unsigned long off = s->inuse; /* The end of info */ - - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + unsigned long off = get_info_end(s); /* The end of info */ if (s->flags & SLAB_STORE_USER) /* We also have user information there */ @@ -907,7 +931,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && val == SLUB_RED_ACTIVE) + if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -1400,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1909,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(flags); void *object; unsigned int cpuset_mems_cookie; @@ -1938,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); @@ -1984,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, #ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. */ @@ -2083,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *prior; unsigned long counters; + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist' is already corrupted. So isolate all objects + * starting at 'freelist'. + */ + if (freelist_corrupted(s, page, freelist, nextfree)) + break; + do { prior = page->freelist; counters = page->counters; @@ -3533,6 +3570,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; + unsigned int freepointer_area; unsigned int order; /* @@ -3541,6 +3579,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the possible location of the free pointer. */ size = ALIGN(size, sizeof(void *)); + /* + * This is the area of the object where a freepointer can be + * safely written. If redzoning adds more to the inuse size, we + * can't use that portion for writing the freepointer, so + * s->offset must be limited within this for the general case. + */ + freepointer_area = size; #ifdef CONFIG_SLUB_DEBUG /* @@ -3579,16 +3624,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * * This is the case if we do RCU, have a constructor or * destructor or are poisoning the objects. + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified. */ s->offset = size; size += sizeof(void *); - } else if (size > sizeof(void *)) { + } else if (freepointer_area > sizeof(void *)) { /* * Store freelist pointer near middle of object to keep * it away from the edges of the object to avoid small * sized over/underflows from neighboring allocations. */ - s->offset = ALIGN(size / 2, sizeof(void *)); + s->offset = ALIGN(freepointer_area / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG @@ -3716,12 +3766,14 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text, unsigned long *map) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map; + + if (!map) + return; slab_err(s, page, text, s->name); slab_lock(page); @@ -3734,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } - put_map(map); - slab_unlock(page); #endif } @@ -3749,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; + unsigned long *map = NULL; + +#ifdef CONFIG_SLUB_DEBUG + map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); +#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3758,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()", + map); } } spin_unlock_irq(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + bitmap_free(map); +#endif + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -4385,6 +4445,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +EXPORT_SYMBOL(__kmalloc_track_caller); #ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, @@ -4415,6 +4476,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif #ifdef CONFIG_SYSFS @@ -5631,7 +5693,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && + !IS_ENABLED(CONFIG_SLUB_STATS)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5665,19 +5728,6 @@ static struct kobj_type slab_ktype = { .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) @@ -5745,7 +5795,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); out: kobject_put(&s->kobj); } @@ -5786,8 +5835,10 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = kset; err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) + if (err) { + kobject_put(&s->kobj); goto out; + } err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5803,7 +5854,6 @@ static int sysfs_slab_add(struct kmem_cache *s) } #endif - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5884,7 +5934,7 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 200aef686722..0db7738d76e9 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -29,7 +29,6 @@ #include <linux/sched.h> #include <asm/dma.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/mm/sparse.c b/mm/sparse.c index 1aee5a481571..b2b9a3e34696 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -17,7 +17,6 @@ #include "internal.h" #include <asm/dma.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> /* * Permanent SPARSEMEM data: @@ -288,7 +287,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) /* * Mark all memblocks as present using memory_present(). This is a - * convienence function that is useful for a number of arches + * convenience function that is useful for a number of arches * to mark all of the systems memory as present during initialization. */ void __init memblocks_present(void) diff --git a/mm/swap.c b/mm/swap.c index bf9a79fed62d..dbcab84c6fce 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -35,6 +35,7 @@ #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> +#include <linux/local_lock.h> #include "internal.h" @@ -44,14 +45,32 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); +/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +struct lru_rotate { + local_lock_t lock; + struct pagevec pvec; +}; +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * The following struct pagevec are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). + */ +struct lru_pvecs { + local_lock_t lock; + struct pagevec lru_add; + struct pagevec lru_deactivate_file; + struct pagevec lru_deactivate; + struct pagevec lru_lazyfree; #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + struct pagevec activate_page; #endif +}; +static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* * This path almost never happens for VM activity - pages are normally @@ -83,8 +102,6 @@ static void __put_single_page(struct page *page) static void __put_compound_page(struct page *page) { - compound_page_dtor *dtor; - /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set @@ -93,8 +110,7 @@ static void __put_compound_page(struct page *page) */ if (!PageHuge(page)) __page_cache_release(page); - dtor = get_compound_page_dtor(page); - (*dtor)(page); + destroy_compound_page(page); } void __put_page(struct page *page) @@ -225,7 +241,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec, page_lru(page)); - (*pgmoved)++; + (*pgmoved) += hpage_nr_pages(page); } } @@ -254,30 +270,57 @@ void rotate_reclaimable_page(struct page *page) unsigned long flags; get_page(page); - local_irq_save(flags); - pvec = this_cpu_ptr(&lru_rotate_pvecs); + local_lock_irqsave(&lru_rotate.lock, flags); + pvec = this_cpu_ptr(&lru_rotate.pvec); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } } -static void update_page_reclaim_stat(struct lruvec *lruvec, - int file, int rotated) +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + do { + unsigned long lrusize; + + /* Record cost event */ + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; + + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. + */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + } while ((lruvec = parent_lruvec(lruvec))); +} - reclaim_stat->recent_scanned[file]++; - if (rotated) - reclaim_stat->recent_rotated[file]++; +void lru_note_cost_page(struct page *page) +{ + lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + page_is_file_lru(page), hpage_nr_pages(page)); } static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); @@ -285,15 +328,16 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, lru); trace_mm_lru_activate(page); - __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(lruvec, file, 1); + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, + nr_pages); } } #ifdef CONFIG_SMP static void activate_page_drain(int cpu) { - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, __activate_page, NULL); @@ -301,19 +345,21 @@ static void activate_page_drain(int cpu) static bool need_activate_page_drain(int cpu) { - return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; + return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; } void activate_page(struct page *page) { page = compound_head(page); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -335,9 +381,12 @@ void activate_page(struct page *page) static void __lru_cache_activate_page(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + struct pagevec *pvec; int i; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + /* * Search backwards on the optimistic assumption that the page being * activated has just been added to this pagevec. Note that only @@ -357,7 +406,7 @@ static void __lru_cache_activate_page(struct page *page) } } - put_cpu_var(lru_add_pvec); + local_unlock(&lru_pvecs.lock); } /* @@ -385,7 +434,7 @@ void mark_page_accessed(struct page *page) } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via - * activate_page_pvecs. Otherwise, assume the page is on a + * lru_pvecs.activate_page. Otherwise, assume the page is on a * pagevec, mark it active and it'll be moved to the active * LRU on the next drain. */ @@ -402,35 +451,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -static void __lru_cache_add(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - - get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) - __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvec); -} - -/** - * lru_cache_add_anon - add a page to the page lists - * @page: the page to add - */ -void lru_cache_add_anon(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} - -void lru_cache_add_file(struct page *page) -{ - if (PageActive(page)) - ClearPageActive(page); - __lru_cache_add(page); -} -EXPORT_SYMBOL(lru_cache_add_file); - /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. @@ -442,10 +462,19 @@ EXPORT_SYMBOL(lru_cache_add_file); */ void lru_cache_add(struct page *page) { + struct pagevec *pvec; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); - __lru_cache_add(page); + + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); + if (!pagevec_add(pvec, page) || PageCompound(page)) + __pagevec_lru_add(pvec); + local_unlock(&lru_pvecs.lock); } +EXPORT_SYMBOL(lru_cache_add); /** * lru_cache_add_active_or_unevictable @@ -501,8 +530,9 @@ void lru_cache_add_active_or_unevictable(struct page *page, static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { - int lru, file; + int lru; bool active; + int nr_pages = hpage_nr_pages(page); if (!PageLRU(page)) return; @@ -515,7 +545,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -536,28 +565,31 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * We moves tha page into tail of inactive. */ add_page_to_lru_list_tail(page, lruvec, lru); - __count_vm_event(PGROTATED); + __count_vm_events(PGROTATED, nr_pages); } - if (active) - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + if (active) { + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); + } } static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_lru(page); int lru = page_lru_base_type(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec, lru); - __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); } } @@ -567,6 +599,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); + int nr_pages = hpage_nr_pages(page); del_page_from_lru_list(page, lruvec, LRU_INACTIVE_ANON + active); @@ -580,9 +613,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); - count_memcg_page_event(page, PGLAZYFREE); - update_page_reclaim_stat(lruvec, 1, 0); + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, + nr_pages); } } @@ -593,30 +626,30 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); + struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_rotate_pvecs, cpu); + pvec = &per_cpu(lru_rotate.pvec, cpu); if (pagevec_count(pvec)) { unsigned long flags; /* No harm done if a racing interrupt already did this */ - local_irq_save(flags); + local_lock_irqsave(&lru_rotate.lock, flags); pagevec_move_tail(pvec); - local_irq_restore(flags); + local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - pvec = &per_cpu(lru_lazyfree_pvecs, cpu); + pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -641,11 +674,14 @@ void deactivate_file_page(struct page *page) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - put_cpu_var(lru_deactivate_file_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -660,12 +696,14 @@ void deactivate_file_page(struct page *page) void deactivate_page(struct page *page) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + local_unlock(&lru_pvecs.lock); } } @@ -680,19 +718,30 @@ void mark_page_lazyfree(struct page *page) { if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); + struct pagevec *pvec; + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); - put_cpu_var(lru_lazyfree_pvecs); + local_unlock(&lru_pvecs.lock); } } void lru_add_drain(void) { - lru_add_drain_cpu(get_cpu()); - put_cpu(); + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&lru_pvecs.lock); +} + +void lru_add_drain_cpu_zone(struct zone *zone) +{ + local_lock(&lru_pvecs.lock); + lru_add_drain_cpu(smp_processor_id()); + drain_local_pages(zone); + local_unlock(&lru_pvecs.lock); } #ifdef CONFIG_SMP @@ -743,11 +792,11 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || - pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || + if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || + pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -890,8 +939,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - const int file = 0; - VM_BUG_ON_PAGE(!PageHead(page), page); VM_BUG_ON_PAGE(PageCompound(page_tail), page); VM_BUG_ON_PAGE(PageLRU(page_tail), page); @@ -917,9 +964,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, add_page_to_lru_list_tail(page_tail, lruvec, page_lru(page_tail)); } - - if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -928,6 +972,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, { enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); + int nr_pages = hpage_nr_pages(page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -962,16 +1007,14 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_lru(page), - PageActive(page)); if (was_unevictable) - count_vm_event(UNEVICTABLE_PGRESCUED); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) - count_vm_event(UNEVICTABLE_PGCULLED); + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } add_page_to_lru_list(page, lruvec, lru); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 45affaef3bc6..7f34343c075a 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return 0; - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); @@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; - if (!do_swap_account) - return; - mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; diff --git a/mm/swap_state.c b/mm/swap_state.c index ebed37bbf7a3..e98ff460e9e9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -22,7 +22,6 @@ #include <linux/swap_slots.h> #include <linux/huge_mm.h> -#include <asm/pgtable.h> /* * swapper_space is a fiction, retained to simplify the path through @@ -360,12 +359,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page = NULL, *new_page = NULL; struct swap_info_struct *si; - int err; + struct page *page; + *new_page_allocated = false; - do { + for (;;) { + int err; /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling @@ -373,12 +373,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ si = get_swap_device(entry); if (!si) - break; - found_page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + return NULL; + page = find_get_page(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (found_page) - break; + if (page) + return page; /* * Just skip read ahead for unused swap slot. @@ -389,54 +389,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) - break; + return NULL; /* - * Get a new page to read into from swap. + * Get a new page to read into from swap. Allocate it now, + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will + * cause any racers to loop around until we add it to cache. */ - if (!new_page) { - new_page = alloc_page_vma(gfp_mask, vma, addr); - if (!new_page) - break; /* Out of memory */ - } + page = alloc_page_vma(gfp_mask, vma, addr); + if (!page) + return NULL; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { - /* - * We might race against get_swap_page() and stumble - * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet. - */ - cond_resched(); - continue; - } else if (err) /* swp entry is obsolete ? */ + if (!err) break; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (likely(!err)) { - /* Initiate read into locked page */ - SetPageWorkingset(new_page); - lru_cache_add_anon(new_page); - *new_page_allocated = true; - return new_page; - } - __ClearPageLocked(new_page); + put_page(page); + if (err != -EEXIST) + return NULL; + /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. + * We might race against __delete_from_swap_cache(), and + * stumble across a swap_map entry whose SWAP_HAS_CACHE + * has not yet been cleared. Or race against another + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE + * in swap_map, but not yet added its page to swap cache. */ - put_swap_page(new_page, entry); - } while (err != -ENOMEM); + cond_resched(); + } + + /* + * The swap entry is ours to swap in. Prepare the new page. + */ + + __SetPageLocked(page); + __SetPageSwapBacked(page); + + /* May fail (-ENOMEM) if XArray node allocation failed. */ + if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + put_swap_page(page, entry); + goto fail_unlock; + } + + if (mem_cgroup_charge(page, NULL, gfp_mask)) { + delete_from_swap_cache(page); + goto fail_unlock; + } + + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + + /* Caller will initiate read into locked page */ + SetPageWorkingset(page); + lru_cache_add(page); + *new_page_allocated = true; + return page; - if (new_page) - put_page(new_page); - return found_page; +fail_unlock: + unlock_page(page); + put_page(page); + return NULL; } /* @@ -509,10 +526,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); - pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, atomic_read(&last_readahead_pages)); if (!hits) - prev_offset = offset; + WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; @@ -534,7 +552,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold read mmap_sem if vmf->vma is not NULL. + * Caller must hold read mmap_lock if vmf->vma is not NULL. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) @@ -716,7 +734,7 @@ static void swap_ra_info(struct vm_fault *vmf, * Primitive swap readahead code. We simply read in a few pages whoes * virtual addresses are around the fault address in the same vma. * - * Caller must hold read mmap_sem if vmf->vma is not NULL. + * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, diff --git a/mm/swapfile.c b/mm/swapfile.c index 5871a2aa86a5..987276c557d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -40,7 +40,6 @@ #include <linux/swap_slots.h> #include <linux/sort.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> @@ -601,7 +600,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, { struct percpu_cluster *cluster; struct swap_cluster_info *ci; - bool found_free; unsigned long tmp, max; new_cluster: @@ -614,17 +612,17 @@ new_cluster: } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them + * discarding, do discard now and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; } - found_free = false; - /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster @@ -632,27 +630,23 @@ new_cluster: tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); - if (tmp >= max) { - cluster_set_null(&cluster->index); - goto new_cluster; - } - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (!si->swap_map[tmp]) { - found_free = true; - break; + if (tmp < max) { + ci = lock_cluster(si, tmp); + while (tmp < max) { + if (!si->swap_map[tmp]) + break; + tmp++; } - tmp++; + unlock_cluster(ci); } - unlock_cluster(ci); - if (!found_free) { + if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return found_free; + return true; } static void __del_from_avail_list(struct swap_info_struct *p) @@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. + */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; - - if (nr > SWAP_BATCH) - nr = SWAP_BATCH; + bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. + */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) - goto checks; - else + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; - } - - if (unlikely(!si->cluster_nr--)) { + } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -848,7 +873,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? */ @@ -871,19 +895,33 @@ checks: if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; - else - goto done; - } - /* non-ssd case */ - ++offset; - - /* non-ssd case, still more slots in cluster? */ - if (si->cluster_nr && !si->swap_map[offset]) { + } else if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } + /* + * Even if there's no free clusters available (fragmented), + * try to scan a little more quickly with lock held unless we + * have scanned too many slots already. + */ + if (!scanned_many) { + unsigned long scan_limit; + + if (offset < scan_base) + scan_limit = scan_base; + else + scan_limit = si->highest_bit; + for (; offset <= scan_limit && --latency_ration > 0; + offset++) { + if (!si->swap_map[offset]) + goto checks; + } + } + done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -901,6 +939,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } } offset = si->lowest_bit; @@ -916,6 +955,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } offset++; } @@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) if (avail_pgs <= 0) goto noswap; - if (n_goal > SWAP_BATCH) - n_goal = SWAP_BATCH; - - if (n_goal > avail_pgs) - n_goal = avail_pgs; + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); @@ -1275,13 +1311,14 @@ unlock_out: } static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, usage); + usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); @@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) - __swap_entry_free(p, entry, 1); + __swap_entry_free(p, entry); } /* @@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { - count = __swap_entry_free(p, entry, 1); + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), @@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { struct page *swapcache; - struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - ret = -ENOMEM; - goto out_nolock; - } - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, activate_page(page); out: pte_unmap_unlock(pte, ptl); -out_nolock: if (page != swapcache) { unlock_page(page); put_page(page); @@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; @@ -2070,7 +2100,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, struct vm_area_struct *vma; int ret = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) { ret = unuse_vma(vma, type, frontswap, @@ -2080,7 +2110,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, } cond_resched(); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret; } @@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; + unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } + bytes = si->pages << (PAGE_SHIFT - 10); + inuse = si->inuse_pages << (PAGE_SHIFT - 10); + file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", - si->pages << (PAGE_SHIFT - 10), - si->inuse_pages << (PAGE_SHIFT - 10), + bytes, bytes < 10000000 ? "\t" : "", + inuse, inuse < 10000000 ? "\t" : "", si->prio); return 0; } @@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3322,6 +3366,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); @@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; - page = list_entry(head->lru.next, struct page, lru); + page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ @@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si, */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; @@ -3682,12 +3728,10 @@ init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = true; /* incremented */ @@ -3698,7 +3742,7 @@ init_map: *map = 0; /* we didn't zero the page */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } @@ -3707,13 +3751,11 @@ init_map: *map = 0; /* we didn't zero the page */ if (*map == 0) count = 0; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = count == COUNT_CONTINUED; } @@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask) +void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; - if (!(gfp_mask & __GFP_IO) || !memcg) + int nid = page_to_nid(page); + + if (!(gfp_mask & __GFP_IO)) return; if (!blk_cgroup_congested()) @@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, return; spin_lock(&swap_avail_lock); - plist_for_each_entry_safe(si, next, &swap_avail_heads[node], - avail_lists[node]) { + plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], + avail_lists[nid]) { if (si->bdev) { - blkcg_schedule_throttle(bdev_get_queue(si->bdev), - true); + blkcg_schedule_throttle(bdev_get_queue(si->bdev), true); break; } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 512576e171ce..b80419320c7d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct page **pagep, bool wp_copy) { - struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; void *page_kaddr; @@ -77,7 +76,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, PAGE_SIZE); kunmap_atomic(page_kaddr); - /* fallback to copy_from_user outside mmap_sem */ + /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; *pagep = page; @@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) + if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL)) goto out_release; _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); @@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -138,7 +136,6 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg, false); out_release: put_page(page); goto out; @@ -203,7 +200,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) #ifdef CONFIG_HUGETLB_PAGE /* * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is - * called with mmap_sem held, it will release mmap_sem before returning. + * called with mmap_lock held, it will release mmap_lock before returning. */ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, @@ -231,7 +228,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * feature is not supported. */ if (zeropage) { - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); return -EINVAL; } @@ -250,7 +247,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, retry: /* - * On routine entry dst_vma is set. If we had to drop mmap_sem and + * On routine entry dst_vma is set. If we had to drop mmap_lock and * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { @@ -318,7 +315,7 @@ retry: cond_resched(); if (unlikely(err == -ENOENT)) { - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); BUG_ON(!page); err = copy_huge_page_from_user(page, @@ -329,7 +326,7 @@ retry: err = -EFAULT; goto out; } - down_read(&dst_mm->mmap_sem); + mmap_read_lock(dst_mm); dst_vma = NULL; goto retry; @@ -349,7 +346,7 @@ retry: } out_unlock: - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); out: if (page) { /* @@ -360,7 +357,7 @@ out: * private and shared mappings. See the routine * restore_reserve_on_error for details. Unfortunately, we * can not call restore_reserve_on_error now as it would - * require holding mmap_sem. + * require holding mmap_lock. * * If a reservation for the page existed in the reservation * map of a private mapping, the map was modified to indicate @@ -488,7 +485,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, copied = 0; page = NULL; retry: - down_read(&dst_mm->mmap_sem); + mmap_read_lock(dst_mm); /* * If memory mappings are changing because of non-cooperative @@ -586,7 +583,7 @@ retry: if (unlikely(err == -ENOENT)) { void *page_kaddr; - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); BUG_ON(!page); page_kaddr = kmap(page); @@ -615,7 +612,7 @@ retry: } out_unlock: - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); out: if (page) put_page(page); @@ -655,7 +652,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, /* Does the address range wrap, or is the span zero-sized? */ BUG_ON(start + len <= start); - down_read(&dst_mm->mmap_sem); + mmap_read_lock(dst_mm); /* * If memory mappings are changing because of non-cooperative @@ -689,6 +686,6 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = 0; out_unlock: - up_read(&dst_mm->mmap_sem); + mmap_read_unlock(dst_mm); return err; } diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..c63c8e47be57 100644 --- a/mm/util.c +++ b/mm/util.c @@ -425,7 +425,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and - * that mmap_sem is held as writer. + * that mmap_lock is held as writer. * * Return: * * 0 on success @@ -437,7 +437,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, unsigned long locked_vm, limit; int ret = 0; - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { @@ -481,10 +481,10 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) if (pages == 0 || !mm) return 0; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -501,11 +501,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate, &uf); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); @@ -604,6 +604,24 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + static inline void *__page_rmapping(struct page *page) { unsigned long mapping; @@ -717,9 +735,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +746,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -798,10 +814,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - vm_acct_memory(pages); /* diff --git a/mm/vmacache.c b/mm/vmacache.c index cdc32a3b02fa..01a6e6688ec1 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -6,7 +6,6 @@ #include <linux/sched/task.h> #include <linux/mm.h> #include <linux/vmacache.h> -#include <asm/pgtable.h> /* * Hash based on the pmd of addr if configured with MMU, which provides a good @@ -25,8 +24,8 @@ * task's vmacache pertains to a different mm (ie, its own). There is * nothing we can do here. * - * Also handle the case where a kernel thread has adopted this mm via use_mm(). - * That kernel thread's vmacache is not applicable to this mm. + * Also handle the case where a kernel thread has adopted this mm via + * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. */ static inline bool vmacache_valid_mm(struct mm_struct *mm) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 399f219544f7..3091c2ca60df 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,6 +34,7 @@ #include <linux/llist.h> #include <linux/bitops.h> #include <linux/rbtree_augmented.h> +#include <linux/overflow.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -68,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -77,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - pgd_t *pgd; + unsigned long end = start + size; unsigned long next; + pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -152,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. */ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -165,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long start = addr; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); - return nr; + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; } -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -1222,14 +1292,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); /* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - vunmap_page_range(va->va_start, va->va_end); -} - -/* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. * @@ -1292,12 +1354,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return false; /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. */ @@ -1390,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1664,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned long vb_idx; @@ -1674,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); - offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; - vb_idx = addr_to_vb_idx((unsigned long)addr); + vb_idx = addr_to_vb_idx(addr); rcu_read_lock(); vb = radix_tree_lookup(&vmap_block_tree, vb_idx); rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1791,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } @@ -1818,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1842,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -1987,51 +2041,6 @@ void __init vmalloc_init(void) } /** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. - */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); - -/** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap * @size: size of the VM area to unmap @@ -2044,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); - -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); - int err; - - err = vmap_page_range(addr, end, prot, pages); - - return err > 0 ? 0 : err; -} -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) @@ -2127,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) @@ -2329,7 +2317,7 @@ static inline void __vfree_deferred(const void *addr) * Use raw_cpu_ptr() because this can be called from preemptible * context. Preemption is absolutely fine here, because the llist_add() * implementation is lockless, so it works even if we are adding to - * nother cpu's list. schedule_work() should be fine with this too. + * another cpu's list. schedule_work() should be fine with this too. */ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); @@ -2440,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2449,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2469,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2503,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: @@ -2572,27 +2560,16 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. - */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2602,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - - -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -2645,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2665,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2703,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2717,39 +2687,16 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. - * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); /** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - -/** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size * @@ -2792,8 +2739,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -3054,6 +3001,7 @@ finished: * @vma: vma to cover * @uaddr: target user address to start at * @kaddr: virtual address of vmalloc kernel memory + * @pgoff: offset from @kaddr to start at * @size: size of map area * * Returns: 0 for success, -Exxx on failure @@ -3066,9 +3014,15 @@ finished: * Similar to remap_pfn_range() (see mm/memory.c) */ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, - void *kaddr, unsigned long size) + void *kaddr, unsigned long pgoff, + unsigned long size) { struct vm_struct *area; + unsigned long off; + unsigned long end_index; + + if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) + return -EINVAL; size = PAGE_ALIGN(size); @@ -3082,8 +3036,10 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; - if (kaddr + size > area->addr + get_vm_area_size(area)) + if (check_add_overflow(size, off, &end_index) || + end_index > get_vm_area_size(area)) return -EINVAL; + kaddr += off; do { struct page *page = vmalloc_to_page(kaddr); @@ -3122,26 +3078,11 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff) { return remap_vmalloc_range_partial(vma, vma->vm_start, - addr + (pgoff << PAGE_SHIFT), + addr, pgoff, vma->vm_end - vma->vm_start); } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; diff --git a/mm/vmscan.c b/mm/vmscan.c index b06868fc4926..b6d84326bdf2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,12 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + /* Can active pages be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 @@ -161,7 +167,7 @@ struct scan_control { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 200. Higher means more swappy. */ int vm_swappiness = 60; /* @@ -676,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, freed += ret; /* * Bail out if someone want to register a new shrinker to - * prevent the regsitration from being stalled for long periods + * prevent the registration from being stalled for long periods * by parallel ongoing shrinking. */ if (rwsem_is_contended(&shrinker_rwsem)) { @@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page, /* * shrink_page_list() returns the number of reclaimed pages */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - enum ttu_flags ttu_flags, - struct reclaim_stat *stat, - bool ignore_references) +static unsigned int shrink_page_list(struct list_head *page_list, + struct pglist_data *pgdat, + struct scan_control *sc, + enum ttu_flags ttu_flags, + struct reclaim_stat *stat, + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); - unsigned nr_reclaimed = 0; - unsigned pgactivate = 0; + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && PageSwapBacked(page)) + stat->nr_lazyfree_fail += nr_pages; goto activate_locked; } } @@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: + stat->nr_pageout += hpage_nr_pages(page); + if (PageWriteback(page)) goto keep; if (PageDirty(page)) @@ -1438,7 +1450,7 @@ free_it: * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); else list_add(&page->lru, &free_pages); continue; @@ -1483,7 +1495,7 @@ keep: return nr_reclaimed; } -unsigned long reclaim_clean_pages_from_list(struct zone *zone, +unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *page_list) { struct scan_control sc = { @@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - struct reclaim_stat dummy_stat; - unsigned long ret; + struct reclaim_stat stat; + unsigned int nr_reclaimed; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } } - ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &dummy_stat, true); + nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + TTU_IGNORE_ACCESS, &stat, true); list_splice(&clean_pages, page_list); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); - return ret; + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. + */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -stat.nr_lazyfree_fail); + return nr_reclaimed; } /* @@ -1591,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) /* * Update LRU sizes after isolating pages. The LRU size updates must - * be complete before mem_cgroup_update_lru_size due to a santity check. + * be complete before mem_cgroup_update_lru_size due to a sanity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, enum lru_list lru, unsigned long *nr_zone_taken) @@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, if (!nr_zone_taken[zid]) continue; - __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); -#endif + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); } } @@ -1625,7 +1644,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session - * @mode: One of the LRU isolation modes * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. @@ -1860,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - (*get_compound_page_dtor(page))(page); + destroy_compound_page(page); spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); @@ -1879,13 +1897,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, /* * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { - return !(current->flags & PF_LESS_THROTTLE) || + return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } @@ -1900,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, { LIST_HEAD(page_list); unsigned long nr_scanned; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; unsigned long nr_taken; struct reclaim_stat stat; - int file = is_file_lru(lru); + bool file = is_file_lru(lru); enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { @@ -1930,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1946,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); + move_pages_to_lru(lruvec, &page_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); - reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; - reclaim_stat->recent_rotated[1] += stat.nr_activate[1]; - - move_pages_to_lru(lruvec, &page_list); - - __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&pgdat->lru_lock); @@ -2002,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; unsigned nr_deactivate, nr_activate; unsigned nr_rotated = 0; int file = is_file_lru(lru); @@ -2016,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); @@ -2043,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) { - nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -2054,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * so we ignore them here. */ if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { + nr_rotated += hpage_nr_pages(page); list_add(&page->lru, &l_active); continue; } @@ -2068,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * Move pages back to the lru list. */ spin_lock_irq(&pgdat->lru_lock); - /* - * Count referenced pages from currently used mappings as rotated, - * even though only some of them are actually re-activated. This - * helps balance scan pressure between file and anonymous pages in - * get_scan_count. - */ - reclaim_stat->recent_rotated[file] += nr_rotated; nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2096,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long reclaim_pages(struct list_head *page_list) { int nid = NUMA_NO_NODE; - unsigned long nr_reclaimed = 0; + unsigned int nr_reclaimed = 0; LIST_HEAD(node_page_list); struct reclaim_stat dummy_stat; struct page *page; @@ -2230,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { struct mem_cgroup *memcg = lruvec_memcg(lruvec); + unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file; unsigned long ap, fp; enum lru_list lru; @@ -2287,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } scan_balance = SCAN_FRACT; - - /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = swappiness; - file_prio = 200 - anon_prio; - /* - * OK, so we have swap space and a fair amount of page cache - * pages. We use the recently rotated / recently scanned - * ratios to determine how valuable each cache is. + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). * - * Because workloads change over time (and to avoid overflow) - * we keep these statistics as a floating average, which ends - * up weighing recent references more than old ones. + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. * - * anon in [0], file in [1] + * With swappiness at 100, anon and file have equal IO cost. */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; - spin_lock_irq(&pgdat->lru_lock); - if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - reclaim_stat->recent_scanned[0] /= 2; - reclaim_stat->recent_rotated[0] /= 2; - } - - if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - reclaim_stat->recent_scanned[1] /= 2; - reclaim_stat->recent_rotated[1] /= 2; - } - - /* - * The amount of pressure on anon vs file pages is inversely - * proportional to the fraction of recently scanned pages on - * each list that were recently referenced and in active use. - */ - ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); - ap /= reclaim_stat->recent_rotated[0] + 1; - - fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); - fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&pgdat->lru_lock); + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; fraction[0] = ap; fraction[1] = fp; - denominator = ap + fp + 1; + denominator = ap + fp; out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); @@ -2389,7 +2371,7 @@ out: /* * Minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards, avoiding decremeting + * reclaim moving forwards, avoiding decrementing * sc->priority further than desirable. */ scan = max(scan, SWAP_CLUSTER_MAX); @@ -2567,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) * Reclaim/compaction is used for high-order allocation requests. It reclaims * order-0 pages before compacting the zone. should_continue_reclaim() returns * true if more pages should be reclaimed such that when the page allocator - * calls try_to_compact_zone() that it will have enough free pages to succeed. + * calls try_to_compact_pages() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, @@ -2698,6 +2680,14 @@ again: nr_scanned = sc->nr_scanned; /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&pgdat->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&pgdat->lru_lock); + + /* * Target desirable inactive:active list ratios for the anon * and file LRU lists. */ @@ -3132,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) - WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3386,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) { int i; struct zone *zone; @@ -3398,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) * start prematurely when there is no boosting and a lower * zone is balanced. */ - for (i = classzone_idx; i >= 0; i--) { + for (i = highest_zoneidx; i >= 0; i--) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3412,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) /* * Returns true if there is an eligible zone balanced for the request order - * and classzone_idx + * and highest_zoneidx */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long mark = -1; @@ -3424,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; mark = high_wmark_pages(zone); - if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } /* - * If a node has no populated zone within classzone_idx, it does not + * If a node has no populated zone within highest_zoneidx, it does not * need balancing by definition. This can happen if a zone-restricted * allocation tries to wake a remote kswapd. */ @@ -3462,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) { /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3484,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - if (pgdat_balanced(pgdat, order, classzone_idx)) { + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { clear_pgdat_congested(pgdat); return true; } @@ -3548,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -3576,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { zone = pgdat->node_zones + i; if (!managed_zone(zone)) continue; @@ -3594,7 +3585,7 @@ restart: bool balanced; bool ret; - sc.reclaim_idx = classzone_idx; + sc.reclaim_idx = highest_zoneidx; /* * If the number of buffer_heads exceeds the maximum allowed @@ -3624,7 +3615,7 @@ restart: * on the grounds that the normal reclaim should be enough to * re-evaluate if boosting is required when kswapd next wakes. */ - balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); if (!balanced && nr_boost_reclaim) { nr_boost_reclaim = 0; goto restart; @@ -3724,7 +3715,7 @@ out: if (boosted) { unsigned long flags; - for (i = 0; i <= classzone_idx; i++) { + for (i = 0; i <= highest_zoneidx; i++) { if (!zone_boosts[i]) continue; @@ -3739,7 +3730,7 @@ out: * As there is now likely space, wakeup kcompact to defragment * pageblocks. */ - wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); } snapshot_refaults(NULL, pgdat); @@ -3757,22 +3748,22 @@ out: } /* - * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be - * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not - * a valid index then either kswapd runs for first time or kswapd couldn't sleep - * after previous reclaim attempt (node is still unbalanced). In that case - * return the zone index of the previous kswapd reclaim cycle. + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. */ -static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type prev_classzone_idx) +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) { - enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, - unsigned int classzone_idx) + unsigned int highest_zoneidx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3789,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3802,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, alloc_order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); remaining = schedule_timeout(HZ/10); /* - * If woken prematurely then reset kswapd_classzone_idx and + * If woken prematurely then reset kswapd_highest_zoneidx and * order. The values will either be from a wakeup request or * the previous request that slept prematurely. */ if (remaining) { - WRITE_ONCE(pgdat->kswapd_classzone_idx, - kswapd_classzone_idx(pgdat, classzone_idx)); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) WRITE_ONCE(pgdat->kswapd_order, reclaim_order); @@ -3828,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * go fully to sleep until explicitly woken up. */ if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3870,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; - unsigned int classzone_idx = MAX_NR_ZONES - 1; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -3894,22 +3886,24 @@ static int kswapd(void *p) set_freezable(); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); for ( ; ; ) { bool ret; alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - classzone_idx); + highest_zoneidx); - /* Read the new order and classzone_idx */ + /* Read the new order and highest_zoneidx */ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); - WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); ret = try_to_freeze(); if (kthread_should_stop()) @@ -3930,9 +3924,10 @@ kswapd_try_sleep: * but kcompactd is woken to compact for the original * request (alloc_order). */ - trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3950,7 +3945,7 @@ kswapd_try_sleep: * needed. */ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, - enum zone_type classzone_idx) + enum zone_type highest_zoneidx) { pg_data_t *pgdat; enum zone_type curr_idx; @@ -3962,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; pgdat = zone->zone_pgdat; - curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); - if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) - WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); if (READ_ONCE(pgdat->kswapd_order) < order) WRITE_ONCE(pgdat->kswapd_order, order); @@ -3975,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - (pgdat_balanced(pgdat, order, classzone_idx) && - !pgdat_watermark_boosted(pgdat, classzone_idx))) { + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd @@ -3985,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, * ratelimit its work. */ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); return; } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..3fb23a21f6dd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1119,6 +1119,9 @@ const char * const vmstat_text[] = { "nr_mlock", "nr_page_table_pages", "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1162,7 +1165,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1201,6 +1203,10 @@ const char * const vmstat_text[] = { "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", #ifdef CONFIG_NUMA "zone_reclaim_failed", @@ -1590,6 +1596,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->present_pages, zone_managed_pages(zone)); + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); @@ -1597,12 +1609,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_putc(m, ')'); - /* If unpopulated, no other information is useful */ - if (!populated_zone(zone)) { - seq_putc(m, '\n'); - return; - } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", zone_stat_name(i), zone_page_state(zone, i)); @@ -1723,6 +1729,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } @@ -1751,7 +1765,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; @@ -2055,24 +2069,14 @@ static int unusable_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations unusable_op = { +static const struct seq_operations unusable_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = unusable_show, }; -static int unusable_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &unusable_op); -} - -static const struct file_operations unusable_file_ops = { - .open = unusable_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(unusable); static void extfrag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) @@ -2107,24 +2111,14 @@ static int extfrag_show(struct seq_file *m, void *arg) return 0; } -static const struct seq_operations extfrag_op = { +static const struct seq_operations extfrag_sops = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = extfrag_show, }; -static int extfrag_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &extfrag_op); -} - -static const struct file_operations extfrag_file_ops = { - .open = extfrag_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(extfrag); static int __init extfrag_debug_init(void) { @@ -2133,10 +2127,10 @@ static int __init extfrag_debug_init(void) extfrag_debug_root = debugfs_create_dir("extfrag", NULL); debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, - &unusable_file_ops); + &unusable_fops); debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, - &extfrag_file_ops); + &extfrag_fops); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index 474186b76ced..d481ea452eeb 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow) struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; + unsigned long workingset_size; struct pglist_data *pgdat; - unsigned long active_file; struct mem_cgroup *memcg; unsigned long eviction; struct lruvec *lruvec; @@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->inactive_age); - active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow) /* * Compare the distance to the existing workingset size. We - * don't act on pages that couldn't stay resident even if all - * the memory was available to the page cache. + * don't activate pages that couldn't stay resident even if + * all the memory was available to the page cache. Whether + * cache can compete with anon or not depends on having swap. */ - if (refault_distance > active_file) + workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_ANON); + workingset_size += lruvec_page_state(eviction_lruvec, + NR_ACTIVE_ANON); + } + if (refault_distance > workingset_size) goto out; SetPageActive(page); @@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow) /* Page was active prior to eviction */ if (workingset) { SetPageWorkingset(page); + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE); } out: diff --git a/mm/z3fold.c b/mm/z3fold.c index 42f31c4b53ad..460b0feced26 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -43,6 +43,7 @@ #include <linux/spinlock.h> #include <linux/zpool.h> #include <linux/magic.h> +#include <linux/kmemleak.h> /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively @@ -215,6 +216,8 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { + /* It will be freed separately in free_handle(). */ + kmemleak_not_leak(slots); memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; rwlock_init(&slots->lock); @@ -318,16 +321,16 @@ static inline void free_handle(unsigned long handle) slots = handle_to_slots(handle); write_lock(&slots->lock); *(unsigned long *)handle = 0; - write_unlock(&slots->lock); - if (zhdr->slots == slots) + if (zhdr->slots == slots) { + write_unlock(&slots->lock); return; /* simple case, nothing else to do */ + } /* we are freeing a foreign handle if we are here */ zhdr->foreign_handles--; is_free = true; - read_lock(&slots->lock); if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { - read_unlock(&slots->lock); + write_unlock(&slots->lock); return; } for (i = 0; i <= BUDDY_MASK; i++) { @@ -336,7 +339,7 @@ static inline void free_handle(unsigned long handle) break; } } - read_unlock(&slots->lock); + write_unlock(&slots->lock); if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); @@ -422,6 +425,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, zhdr->start_middle = 0; zhdr->cpu = -1; zhdr->foreign_handles = 0; + zhdr->mapped_count = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); diff --git a/mm/zbud.c b/mm/zbud.c index de5dd4ddaa82..bc93aa4e46fc 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page) zhdr->last_chunks = 0; INIT_LIST_HEAD(&zhdr->buddy); INIT_LIST_HEAD(&zhdr->lru); - zhdr->under_reclaim = 0; + zhdr->under_reclaim = false; return zhdr; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..952a01e45c6a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -39,8 +39,8 @@ #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> +#include <linux/pgtable.h> #include <asm/tlbflush.h> -#include <asm/pgtable.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1235,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) { |