diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig.debug | 2 | ||||
-rw-r--r-- | mm/bootmem.c | 14 | ||||
-rw-r--r-- | mm/compaction.c | 205 | ||||
-rw-r--r-- | mm/debug.c | 5 | ||||
-rw-r--r-- | mm/filemap.c | 185 | ||||
-rw-r--r-- | mm/huge_memory.c | 83 | ||||
-rw-r--r-- | mm/hugetlb.c | 53 | ||||
-rw-r--r-- | mm/internal.h | 3 | ||||
-rw-r--r-- | mm/ksm.c | 10 | ||||
-rw-r--r-- | mm/memblock.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 154 | ||||
-rw-r--r-- | mm/memory.c | 23 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 14 | ||||
-rw-r--r-- | mm/mempolicy.c | 2 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/mincore.c | 5 | ||||
-rw-r--r-- | mm/mlock.c | 52 | ||||
-rw-r--r-- | mm/mmap.c | 251 | ||||
-rw-r--r-- | mm/mprotect.c | 93 | ||||
-rw-r--r-- | mm/nobootmem.c | 20 | ||||
-rw-r--r-- | mm/oom_kill.c | 381 | ||||
-rw-r--r-- | mm/page-writeback.c | 60 | ||||
-rw-r--r-- | mm/page_alloc.c | 281 | ||||
-rw-r--r-- | mm/page_ext.c | 45 | ||||
-rw-r--r-- | mm/page_io.c | 7 | ||||
-rw-r--r-- | mm/page_isolation.c | 2 | ||||
-rw-r--r-- | mm/page_owner.c | 156 | ||||
-rw-r--r-- | mm/shmem.c | 119 | ||||
-rw-r--r-- | mm/slab.c | 114 | ||||
-rw-r--r-- | mm/slub.c | 65 | ||||
-rw-r--r-- | mm/swap.c | 4 | ||||
-rw-r--r-- | mm/swap_state.c | 14 | ||||
-rw-r--r-- | mm/swapfile.c | 137 | ||||
-rw-r--r-- | mm/vmacache.c | 8 | ||||
-rw-r--r-- | mm/vmalloc.c | 22 | ||||
-rw-r--r-- | mm/vmscan.c | 53 | ||||
-rw-r--r-- | mm/vmstat.c | 95 | ||||
-rw-r--r-- | mm/workingset.c | 10 |
38 files changed, 1537 insertions, 1217 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 22f4cd96acb0..afcc550877ff 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -76,8 +76,6 @@ config PAGE_POISONING_ZERO no longer necessary to write zeros when GFP_ZERO is used on allocation. - Enabling page poisoning with this option will disable hibernation - If unsure, say N bool diff --git a/mm/bootmem.c b/mm/bootmem.c index 0aa7dda52402..a869f84f44d3 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -11,15 +11,12 @@ #include <linux/init.h> #include <linux/pfn.h> #include <linux/slab.h> -#include <linux/bootmem.h> #include <linux/export.h> #include <linux/kmemleak.h> #include <linux/range.h> -#include <linux/memblock.h> #include <linux/bug.h> #include <linux/io.h> - -#include <asm/processor.h> +#include <linux/bootmem.h> #include "internal.h" @@ -712,7 +709,7 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, void *ptr; if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); again: /* do not panic in alloc_bootmem_bdata() */ @@ -738,9 +735,6 @@ again: void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } @@ -812,10 +806,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif - /** * __alloc_bootmem_low - allocate low boot memory * @size: size of the request in bytes diff --git a/mm/compaction.c b/mm/compaction.c index 9affb2908304..0409a4ad6ea1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -997,8 +997,12 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #ifdef CONFIG_COMPACTION /* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct page *page) +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) { + if (cc->ignore_block_suitable) + return true; + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1083,7 +1087,7 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Check the block is suitable for migration */ - if (!suitable_migration_target(page)) + if (!suitable_migration_target(cc, page)) continue; /* If isolation recently failed, do not retry */ @@ -1316,7 +1320,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ - watermark = low_wmark_pages(zone); + watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, cc->alloc_flags)) @@ -1329,13 +1333,13 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && !list_empty(&area->free_list[MIGRATE_CMA])) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; #endif /* * Job done if allocation would steal freepages from @@ -1343,7 +1347,7 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ */ if (find_suitable_fallback(area, order, migratetype, true, &can_steal) != -1) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; } return COMPACT_NO_SUITABLE_PAGE; @@ -1367,7 +1371,7 @@ static enum compact_result compact_finished(struct zone *zone, * compaction_suitable: Is this suitable to run compaction on this zone now? * Returns * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ static enum compact_result __compaction_suitable(struct zone *zone, int order, @@ -1375,46 +1379,41 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, int classzone_idx, unsigned long wmark_target) { - int fragindex; unsigned long watermark; if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = low_wmark_pages(zone); + watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. */ if (zone_watermark_ok(zone, order, watermark, classzone_idx, alloc_flags)) - return COMPACT_PARTIAL; + return COMPACT_SUCCESS; /* - * Watermarks for order-0 must be met for compaction. Note the 2UL. - * This is because during migration, copies of pages need to be - * allocated and for a short time, the footprint is higher + * Watermarks for order-0 must be met for compaction to be able to + * isolate free pages for migration targets. This means that the + * watermark and alloc_flags have to match, or be more pessimistic than + * the check in __isolate_free_page(). We don't use the direct + * compactor's alloc_flags, as they are not relevant for freepage + * isolation. We however do use the direct compactor's classzone_idx to + * skip over zones where lowmem reserves would prevent allocation even + * if compaction succeeds. + * For costly orders, we require low watermark instead of min for + * compaction to proceed to increase its chances. + * ALLOC_CMA is used, as pages in CMA pageblocks are considered + * suitable migration targets */ - watermark += (2UL << order); + watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? + low_wmark_pages(zone) : min_wmark_pages(zone); + watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - alloc_flags, wmark_target)) + ALLOC_CMA, wmark_target)) return COMPACT_SKIPPED; - /* - * fragmentation index determines if allocation failures are due to - * low memory or external fragmentation - * - * index of -1000 would imply allocations might succeed depending on - * watermarks, but we already failed the high-order watermark check - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_NOT_SUITABLE_ZONE; - return COMPACT_CONTINUE; } @@ -1423,9 +1422,32 @@ enum compact_result compaction_suitable(struct zone *zone, int order, int classzone_idx) { enum compact_result ret; + int fragindex; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, zone_page_state(zone, NR_FREE_PAGES)); + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. Also + * ignore fragindex for non-costly orders where the alternative to + * a successful reclaim/compaction is OOM. Fragindex and the + * vm.extfrag_threshold sysctl is meant as a heuristic to prevent + * excessive compaction for costly orders, but it should not be at the + * expense of system stability. + */ + if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) { + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + ret = COMPACT_NOT_SUITABLE_ZONE; + } + trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1458,8 +1480,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, available += zone_page_state_snapshot(zone, NR_FREE_PAGES); compact_result = __compaction_suitable(zone, order, alloc_flags, ac_classzone_idx(ac), available); - if (compact_result != COMPACT_SKIPPED && - compact_result != COMPACT_NOT_SUITABLE_ZONE) + if (compact_result != COMPACT_SKIPPED) return true; } @@ -1477,7 +1498,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ - if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) + if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; /* huh, compaction_suitable is returning something unexpected */ @@ -1492,23 +1513,29 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro /* * Setup to move all movable pages to the end of the zone. Used cached - * information on where the scanners should start but check that it - * is initialised by ensuring the values are within zone boundaries. + * information on where the scanners should start (unless we explicitly + * want to compact the whole zone), but check that it is initialised + * by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; - cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = pageblock_start_pfn(end_pfn - 1); - zone->compact_cached_free_pfn = cc->free_pfn; - } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + if (cc->whole_zone) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; - zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; - } + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + } else { + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } - if (cc->migrate_pfn == start_pfn) - cc->whole_zone = true; + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + } cc->last_migrated_pfn = 0; @@ -1638,6 +1665,9 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, .direct_compaction = true, + .whole_zone = (prio == MIN_COMPACT_PRIORITY), + .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), + .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1683,7 +1713,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, ac->nodemask) { enum compact_result status; - if (compaction_deferred(zone, order)) { + if (prio > MIN_COMPACT_PRIORITY + && compaction_deferred(zone, order)) { rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; } @@ -1692,9 +1723,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, alloc_flags, ac_classzone_idx(ac)); rc = max(status, rc); - /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags)) { + /* The allocation should succeed, stop compacting */ + if (status == COMPACT_SUCCESS) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller @@ -1730,10 +1760,18 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact all zones within a node */ -static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +static void compact_node(int nid) { + pg_data_t *pgdat = NODE_DATA(nid); int zoneid; struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .whole_zone = true, + }; + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -1741,60 +1779,19 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (!populated_zone(zone)) continue; - cc->nr_freepages = 0; - cc->nr_migratepages = 0; - cc->zone = zone; - INIT_LIST_HEAD(&cc->freepages); - INIT_LIST_HEAD(&cc->migratepages); - - /* - * When called via /proc/sys/vm/compact_memory - * this makes sure we compact the whole zone regardless of - * cached scanner positions. - */ - if (is_via_compact_memory(cc->order)) - __reset_isolation_suitable(zone); - - if (is_via_compact_memory(cc->order) || - !compaction_deferred(zone, cc->order)) - compact_zone(zone, cc); - - VM_BUG_ON(!list_empty(&cc->freepages)); - VM_BUG_ON(!list_empty(&cc->migratepages)); + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); - if (is_via_compact_memory(cc->order)) - continue; + compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); } } -void compact_pgdat(pg_data_t *pgdat, int order) -{ - struct compact_control cc = { - .order = order, - .mode = MIGRATE_ASYNC, - }; - - if (!order) - return; - - __compact_pgdat(pgdat, &cc); -} - -static void compact_node(int nid) -{ - struct compact_control cc = { - .order = -1, - .mode = MIGRATE_SYNC, - .ignore_skip_hint = true, - }; - - __compact_pgdat(NODE_DATA(nid), &cc); -} - /* Compact all nodes in the system */ static void compact_nodes(void) { @@ -1900,8 +1897,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = true, }; - bool success = false; - trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); @@ -1930,9 +1925,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) return; status = compact_zone(zone, &cc); - if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), - cc.classzone_idx, 0)) { - success = true; + if (status == COMPACT_SUCCESS) { compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* diff --git a/mm/debug.c b/mm/debug.c index 74c7cae4f683..9feb699c5d25 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,6 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + /* + * Avoid VM_BUG_ON() in page_mapcount(). + * page->_mapcount space in struct page is used by sl[aou]b pages to + * encode own info. + */ int mapcount = PageSlab(page) ? 0 : page_mapcount(page); pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", diff --git a/mm/filemap.c b/mm/filemap.c index 8a287dfc5372..849f459ad078 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -110,36 +110,94 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, 0, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + + mapping->nrexceptional--; + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); + } + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - struct radix_tree_node *node; int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } - mapping->nrpages -= nr; - for (i = 0; i < nr; i++) { - node = radix_tree_replace_clear_tags(&mapping->page_tree, - page->index + i, shadow); + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(&mapping->page_tree, page->index + i, + &node, &slot); + + radix_tree_clear_tags(&mapping->page_tree, node, slot); + if (!node) { VM_BUG_ON_PAGE(nr != 1, page); - return; + /* + * We need a node to properly account shadow + * entries. Don't plant any without. XXX + */ + shadow = NULL; } + radix_tree_replace_slot(slot, shadow); + + if (!node) + break; + workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); @@ -163,6 +221,18 @@ static void page_cache_tree_delete(struct address_space *mapping, &node->private_list); } } + + if (shadow) { + mapping->nrexceptional += nr; + /* + * Make sure the nrexceptional update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages -= nr; } /* @@ -561,9 +631,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL); - error = radix_tree_insert(&mapping->page_tree, offset, new); + error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); - mapping->nrpages++; /* * hugetlb pages do not participate in page cache accounting. @@ -584,62 +653,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->page_tree, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - if (node) - workingset_node_shadows_dec(node); - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, - false); - } - } - radix_tree_replace_slot(slot, page); - mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } - return 0; -} - static int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask, @@ -1674,6 +1687,10 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return -EINVAL; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + index = *ppos >> PAGE_SHIFT; prev_index = ra->prev_pos >> PAGE_SHIFT; prev_offset = ra->prev_pos & (PAGE_SIZE-1); @@ -1708,7 +1725,9 @@ find_page: * wait_on_page_locked is used to avoid unnecessarily * serialisations and why it's safe. */ - wait_on_page_locked_killable(page); + error = wait_on_page_locked_killable(page); + if (unlikely(error)) + goto readpage_error; if (PageUptodate(page)) goto page_ok; @@ -1910,17 +1929,19 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + struct iov_iter data = *iter; loff_t size; size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, iocb->ki_pos + count - 1); - if (!retval) { - struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(iocb, &data); - } + if (retval < 0) + goto out; - if (retval > 0) { + file_accessed(file); + + retval = mapping->a_ops->direct_IO(iocb, &data); + if (retval >= 0) { iocb->ki_pos += retval; iov_iter_advance(iter, retval); } @@ -1935,10 +1956,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * DAX files, so don't bother trying. */ if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || - IS_DAX(inode)) { - file_accessed(file); + IS_DAX(inode)) goto out; - } } retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 53ae6d00656a..cdcd25cb30fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -struct page *get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -86,7 +86,7 @@ retry: return READ_ONCE(huge_zero_page); } -void put_huge_zero_page(void) +static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -95,6 +95,26 @@ void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -469,6 +489,49 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) +{ + unsigned long addr; + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); + unsigned long len_pad; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + + len_pad = len + size; + if (len_pad < len || (off + len_pad) < off) + return 0; + + addr = current->mm->get_unmapped_area(filp, 0, len_pad, + off >> PAGE_SHIFT, flags); + if (IS_ERR_VALUE(addr)) + return 0; + + addr += (off - addr) & (size - 1); + return addr; +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + + if (addr) + goto out; + if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) + goto out; + + addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); + if (addr) + return addr; + + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} +EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, gfp_t gfp) { @@ -601,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); @@ -623,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) } } else spin_unlock(fe->ptl); - if (!set) { + if (!set) pte_free(vma->vm_mm, pgtable); - put_huge_zero_page(); - } return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); @@ -780,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(dst_mm); set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); ret = 0; @@ -1038,7 +1099,6 @@ alloc: update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1165,7 +1225,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pmd_write(pmd)) flags |= TNF_NO_GROUP; /* @@ -1499,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - put_huge_zero_page(); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, @@ -1522,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); - if (is_huge_zero_pmd(_pmd)) - put_huge_zero_page(); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); @@ -1563,7 +1620,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); } else { - entry = mk_pte(page + i, vma->vm_page_prot); + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); if (!write) entry = pte_wrprotect(entry); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87e11d8ad536..ec49d9ef1eef 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -567,13 +567,13 @@ retry: * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ -void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (restore_reserve && rsv_adjust) { + if (rsv_adjust) { struct hstate *h = hstate_inode(inode); hugetlb_acct_memory(h, 1); @@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \ +#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, @@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use (including surplus) hugepages. + * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the + * number of free hugepages would be reduced below the number of reserved + * hugepages. */ -static void dissolve_free_huge_page(struct page *page) +static int dissolve_free_huge_page(struct page *page) { + int rc = 0; + spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + if (h->free_huge_pages - h->resv_huge_pages == 0) { + rc = -EBUSY; + goto out; + } + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; h->max_huge_pages--; - update_and_free_page(h, page); + update_and_free_page(h, head); } +out: spin_unlock(&hugetlb_lock); + return rc; } /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. + * Also note that if dissolve_free_huge_page() returns with an error, all + * free hugepages that were dissolved before that error are lost. */ -void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; + struct page *page; + int rc = 0; if (!hugepages_supported()) - return; + return rc; + + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + page = pfn_to_page(pfn); + if (PageHuge(page) && !page_count(page)) { + rc = dissolve_free_huge_page(page); + if (rc) + break; + } + } - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) - dissolve_free_huge_page(pfn_to_page(pfn)); + return rc; } /* diff --git a/mm/internal.h b/mm/internal.h index 1501304f87a4..537ac9951f5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -178,8 +178,9 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ - bool whole_zone; /* Whole zone has been scanned */ + bool whole_zone; /* Whole zone should/has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ @@ -283,7 +283,8 @@ static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; - rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | + __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; @@ -298,7 +299,12 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) static inline struct stable_node *alloc_stable_node(void) { - return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); + /* + * The allocation can take too long with GFP_KERNEL when memory is under + * pressure, which may lead to hung task warnings. Adding __GFP_HIGH + * grants access to memory reserves, helping to avoid this problem. + */ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct stable_node *stable_node) diff --git a/mm/memblock.c b/mm/memblock.c index 483197ef613f..c8dfa430342b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1438,6 +1438,11 @@ phys_addr_t __init_memblock memblock_phys_mem_size(void) return memblock.memory.total_size; } +phys_addr_t __init_memblock memblock_reserved_size(void) +{ + return memblock.reserved.total_size; +} + phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) { unsigned long pages = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be518d4e68a..ae052b5e3315 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,6 +921,43 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** + * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each task + * @arg: argument passed to @fn + * + * This function iterates over tasks attached to @memcg or to any of its + * descendants and calls @fn for each task. If @fn returns a non-zero + * value, the function breaks the iteration loop and returns the value. + * Otherwise, it will iterate over all tasks and return 0. + * + * This function must not be called for the root memory cgroup. + */ +int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + struct mem_cgroup *iter; + int ret = 0; + + BUG_ON(memcg == root_mem_cgroup); + + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + if (ret) { + mem_cgroup_iter_break(memcg, iter); + break; + } + } + return ret; +} + +/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -1178,7 +1215,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) { unsigned long limit; @@ -1205,79 +1242,12 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, .gfp_mask = gfp_mask, .order = order, }; - struct mem_cgroup *iter; - unsigned long chosen_points = 0; - unsigned long totalpages; - unsigned int points = 0; - struct task_struct *chosen = NULL; + bool ret; mutex_lock(&oom_lock); - - /* - * If current has a pending SIGKILL or is exiting, then automatically - * select it. The goal is to allow it to allocate so that it may - * quickly exit and free its memory. - */ - if (task_will_free_mem(current)) { - mark_oom_victim(current); - wake_oom_reaper(current); - goto unlock; - } - - check_panic_on_oom(&oc, CONSTRAINT_MEMCG); - totalpages = mem_cgroup_get_limit(memcg) ? : 1; - for_each_mem_cgroup_tree(iter, memcg) { - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&iter->css, &it); - while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task)) { - case OOM_SCAN_SELECT: - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = ULONG_MAX; - get_task_struct(chosen); - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - css_task_iter_end(&it); - mem_cgroup_iter_break(memcg, iter); - if (chosen) - put_task_struct(chosen); - /* Set a dummy value to return "true". */ - chosen = (void *) 1; - goto unlock; - case OOM_SCAN_OK: - break; - }; - points = oom_badness(task, memcg, NULL, totalpages); - if (!points || points < chosen_points) - continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && - thread_group_leader(chosen)) - continue; - - if (chosen) - put_task_struct(chosen); - chosen = task; - chosen_points = points; - get_task_struct(chosen); - } - css_task_iter_end(&it); - } - - if (chosen) { - points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, - "Memory cgroup out of memory"); - } -unlock: + ret = out_of_memory(&oc); mutex_unlock(&oom_lock); - return chosen; + return ret; } #if MAX_NUMNODES > 1 @@ -1600,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle || oom_killer_disabled) + if (!handle) goto cleanup; owait.memcg = memcg; @@ -2969,16 +2939,16 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation - * function is the last one to run. See sock_update_memcg() for - * details, and note that we don't mark any socket as belonging - * to this memcg until that flag is up. + * function is the last one to run. See mem_cgroup_sk_alloc() + * for details, and note that we don't mark any socket as + * belonging to this memcg until that flag is up. * * We need to do this, because static_keys will span multiple * sites, but we can't control their order. If we mark a socket * as accounted, but the accounting functions are not patched in * yet, we'll lose accounting. * - * We never race with the readers in sock_update_memcg(), + * We never race with the readers in mem_cgroup_sk_alloc(), * because when this value change, the code to process it is not * patched in yet. */ @@ -4092,11 +4062,13 @@ static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); atomic_add(n, &memcg->id.ref); } static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { + VM_BUG_ON(atomic_read(&memcg->id.ref) < n); if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4285,8 +4257,10 @@ fail: static int mem_cgroup_css_online(struct cgroup_subsys_state *css) { + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + /* Online state pins memcg ID, memcg ID pins CSS */ - mem_cgroup_id_get(mem_cgroup_from_css(css)); + atomic_set(&memcg->id.ref, 1); css_get(css); return 0; } @@ -4434,7 +4408,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ - page = find_get_page(swap_address_space(ent), ent.val); + page = find_get_page(swap_address_space(ent), swp_offset(ent)); if (do_memsw_account()) entry->val = ent.val; @@ -4472,7 +4446,8 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); @@ -4707,7 +4682,8 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) .mm = mm, }; down_read(&mm->mmap_sem); - walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); + walk_page_range(0, mm->highest_vm_end, + &mem_cgroup_count_precharge_walk); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -4995,7 +4971,8 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); + walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } @@ -5674,11 +5651,15 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -void sock_update_memcg(struct sock *sk) +void mem_cgroup_sk_alloc(struct sock *sk) { struct mem_cgroup *memcg; - /* Socket cloning can throw us here with sk_cgrp already + if (!mem_cgroup_sockets_enabled) + return; + + /* + * Socket cloning can throw us here with sk_memcg already * filled. It won't however, necessarily happen from * process context. So the test for root memcg given * the current task's memcg won't help us in this case. @@ -5703,12 +5684,11 @@ void sock_update_memcg(struct sock *sk) out: rcu_read_unlock(); } -EXPORT_SYMBOL(sock_update_memcg); -void sock_release_memcg(struct sock *sk) +void mem_cgroup_sk_free(struct sock *sk) { - WARN_ON(!sk->sk_memcg); - css_put(&sk->sk_memcg->css); + if (sk->sk_memcg) + css_put(&sk->sk_memcg->css); } /** diff --git a/mm/memory.c b/mm/memory.c index 793fe0f9841c..fc1987dfd8cc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1649,10 +1649,14 @@ EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { + pgprot_t pgprot = vma->vm_page_prot; + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1670,9 +1674,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, vma->vm_page_prot); + return insert_pfn(vma, addr, pfn, pgprot); } EXPORT_SYMBOL(vm_insert_mixed); @@ -3395,7 +3399,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!(vma->vm_flags & VM_WRITE)) + if (!pte_write(pte)) flags |= TNF_NO_GROUP; /* @@ -3658,6 +3662,19 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } + /* + * This mm has been already reaped by the oom reaper and so the + * refault cannot be trusted in general. Anonymous refaults would + * lose data and give a zero page instead e.g. This is especially + * problem for use_mm() because regular tasks will just die and + * the corrupted data will not be visible anywhere while kthread + * will outlive the oom victim and potentially propagate the date + * further. + */ + if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) + && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) + ret = VM_FAULT_SIGBUS; + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b58906b6215c..962927309b6e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1555,8 +1555,8 @@ static struct page *new_node_page(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); - nodemask_t nmask = node_online_map; - struct page *new_page; + nodemask_t nmask = node_states[N_MEMORY]; + struct page *new_page = NULL; /* * TODO: allocate a destination hugepage from a nearest neighbor node, @@ -1567,14 +1567,14 @@ static struct page *new_node_page(struct page *page, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(page)), next_node_in(nid, nmask)); - if (nid != next_node_in(nid, nmask)) - node_clear(nid, nmask); + node_clear(nid, nmask); if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; - new_page = __alloc_pages_nodemask(gfp_mask, 0, + if (!nodes_empty(nmask)) + new_page = __alloc_pages_nodemask(gfp_mask, 0, node_zonelist(nid, gfp_mask), &nmask); if (!new_page) new_page = __alloc_pages(gfp_mask, 0, @@ -1945,7 +1945,9 @@ repeat: * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. */ - dissolve_free_huge_pages(start_pfn, end_pfn); + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) + goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2da72a5b6ecc..ad1c96ac313c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1749,7 +1749,7 @@ unsigned int mempolicy_slab_node(void) */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(node)->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? z->zone->node : node; diff --git a/mm/migrate.c b/mm/migrate.c index f7ee04a5ae27..99250aee1ac1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -234,7 +234,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto unlock; get_page(new); - pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*ptep)) pte = pte_mksoft_dirty(pte); diff --git a/mm/mincore.c b/mm/mincore.c index c0b5ba965200..bfb866435478 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swp), swp.val); + page = find_get_page(swap_address_space(swp), + swp_offset(swp)); } } else page = find_get_page(mapping, pgoff); @@ -150,7 +151,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { #ifdef CONFIG_SWAP *vec = mincore_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); #else WARN_ON(1); *vec = 1; diff --git a/mm/mlock.c b/mm/mlock.c index 14645be06e30..145a4258ddbc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -516,6 +516,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int nr_pages; int ret = 0; int lock = !!(newflags & VM_LOCKED); + vm_flags_t old_flags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -550,6 +551,8 @@ success: nr_pages = (end - start) >> PAGE_SHIFT; if (!lock) nr_pages = -nr_pages; + else if (old_flags & VM_LOCKED) + nr_pages = 0; mm->locked_vm += nr_pages; /* @@ -617,6 +620,45 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return error; } +/* + * Go through vma areas and sum size of mlocked + * vma pages, as return value. + * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) + * is also counted. + * Return value: previously mlocked page counts + */ +static int count_mm_mlocked_page_nr(struct mm_struct *mm, + unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + int count = 0; + + if (mm == NULL) + mm = current->mm; + + vma = find_vma(mm, start); + if (vma == NULL) + vma = mm->mmap; + + for (; vma ; vma = vma->vm_next) { + if (start >= vma->vm_end) + continue; + if (start + len <= vma->vm_start) + break; + if (vma->vm_flags & VM_LOCKED) { + if (start > vma->vm_start) + count -= (start - vma->vm_start); + if (start + len < vma->vm_end) { + count += start + len - vma->vm_start; + break; + } + count += vma->vm_end - vma->vm_start; + } + } + + return count >> PAGE_SHIFT; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; @@ -639,6 +681,16 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla return -EINTR; locked += current->mm->locked_vm; + if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { + /* + * It is possible that the regions requested intersect with + * previously mlocked areas, that part area in "mm->locked_vm" + * should not be counted to new mlock increment count. So check + * and adjust locked count if necessary. + */ + locked -= count_mm_mlocked_page_nr(current->mm, + start, len); + } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) diff --git a/mm/mmap.c b/mm/mmap.c index ca9d91bca0d6..1af87c14183d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,6 +88,11 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE: + * r: (no) no + * w: (no) no + * x: (yes) yes */ pgprot_t protection_map[16] = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -111,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) void vma_set_page_prot(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); - if (vma_wants_writenotify(vma)) { + vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; - vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, - vm_flags); + vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } + /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* @@ -395,15 +402,9 @@ static inline void vma_rb_insert(struct vm_area_struct *vma, rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } -static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) { /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of the vma being erased. - */ - validate_mm_rb(root, vma); - - /* * Note rb_erase_augmented is a fairly large inline function, * so make sure we instantiate it only once with our desired * augmented rbtree callbacks. @@ -411,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); } +static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, + struct rb_root *root, + struct vm_area_struct *ignore) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the "next" vma being erased if + * next->vm_start was reduced. + */ + validate_mm_rb(root, ignore); + + __vma_rb_erase(vma, root); +} + +static __always_inline void vma_rb_erase(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + __vma_rb_erase(vma, root); +} + /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. @@ -594,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; } -static inline void -__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static __always_inline void __vma_unlink_common(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + bool has_prev, + struct vm_area_struct *ignore) { struct vm_area_struct *next; - vma_rb_erase(vma, &mm->mm_rb); - prev->vm_next = next = vma->vm_next; + vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); + next = vma->vm_next; + if (has_prev) + prev->vm_next = next; + else { + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + } if (next) next->vm_prev = prev; @@ -609,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, vmacache_invalidate(mm); } +static inline void __vma_unlink_prev(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + __vma_unlink_common(mm, vma, prev, true, vma); +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -616,11 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) +int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; @@ -636,9 +682,38 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma expands, overlapping all the next, and * perhaps the one after too (mprotect case 6). + * The only other cases that gets here are + * case 1, case 7 and case 8. */ - remove_next = 1 + (end > next->vm_end); - end = next->vm_end; + if (next == expand) { + /* + * The only case where we don't expand "vma" + * and we expand "next" instead is case 8. + */ + VM_WARN_ON(end != next->vm_end); + /* + * remove_next == 3 means we're + * removing "vma" and that to do so we + * swapped "vma" and "next". + */ + remove_next = 3; + VM_WARN_ON(file != next->vm_file); + swap(vma, next); + } else { + VM_WARN_ON(expand != vma); + /* + * case 1, 6, 7, remove_next == 2 is case 6, + * remove_next == 1 is case 1 or 7. + */ + remove_next = 1 + (end > next->vm_end); + VM_WARN_ON(remove_next == 2 && + end != next->vm_next->vm_end); + VM_WARN_ON(remove_next == 1 && + end != next->vm_end); + /* trim end to next, for case 6 first pass */ + end = next->vm_end; + } + exporter = next; importer = vma; @@ -646,7 +721,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, * If next doesn't have anon_vma, import from vma after * next, if the vma overlaps with it. */ - if (remove_next == 2 && next && !next->anon_vma) + if (remove_next == 2 && !next->anon_vma) exporter = next->vm_next; } else if (end > next->vm_start) { @@ -657,6 +732,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = (end - next->vm_start) >> PAGE_SHIFT; exporter = next; importer = vma; + VM_WARN_ON(expand != importer); } else if (end < vma->vm_end) { /* * vma shrinks, and !insert tells it's not @@ -666,6 +742,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); exporter = vma; importer = next; + VM_WARN_ON(expand != importer); } /* @@ -683,7 +760,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, } } again: - vma_adjust_trans_huge(vma, start, end, adjust_next); + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; @@ -709,8 +786,8 @@ again: if (!anon_vma && adjust_next) anon_vma = next->anon_vma; if (anon_vma) { - VM_BUG_ON_VMA(adjust_next && next->anon_vma && - anon_vma != next->anon_vma, next); + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) @@ -750,7 +827,19 @@ again: * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. */ - __vma_unlink(mm, next, vma); + if (remove_next != 3) + __vma_unlink_prev(mm, next, vma); + else + /* + * vma is not before next if they've been + * swapped. + * + * pre-swap() next->vm_start was reduced so + * tell validate_mm_rb to ignore pre-swap() + * "next" (which is stored in post-swap() + * "vma"). + */ + __vma_unlink_common(mm, next, NULL, false, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { @@ -802,7 +891,27 @@ again: * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - next = vma->vm_next; + if (remove_next != 3) { + /* + * If "next" was removed and vma->vm_end was + * expanded (up) over it, in turn + * "next->vm_prev->vm_end" changed and the + * "vma->vm_next" gap must be updated. + */ + next = vma->vm_next; + } else { + /* + * For the scope of the comment "next" and + * "vma" considered pre-swap(): if "vma" was + * removed, next->vm_start was expanded (down) + * over it and the "next" gap must be updated. + * Because of the swap() the post-swap() "vma" + * actually points to pre-swap() "next" + * (post-swap() "next" as opposed is now a + * dangling pointer). + */ + next = vma; + } if (remove_next == 2) { remove_next = 1; end = next->vm_end; @@ -810,8 +919,28 @@ again: } else if (next) vma_gap_update(next); - else - mm->highest_vm_end = end; + else { + /* + * If remove_next == 2 we obviously can't + * reach this path. + * + * If remove_next == 3 we can't reach this + * path because pre-swap() next is always not + * NULL. pre-swap() "next" is not being + * removed and its next->vm_end is not altered + * (and furthermore "end" already matches + * next->vm_end in remove_next == 3). + * + * We reach this only in the remove_next == 1 + * case if the "next" vma that was removed was + * the highest vma of the mm. However in such + * case next->vm_end == "end" and the extended + * "vma" has vma->vm_end == next->vm_end so + * mm->highest_vm_end doesn't need any update + * in remove_next == 1 case. + */ + VM_WARN_ON(mm->highest_vm_end != end); + } } if (insert && file) uprobe_mmap(insert); @@ -931,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * cannot merge might become might become might become * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPNNNNNNNN 8 + * mremap move: PPPPXXXXXXXX 8 * AAAA * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN * might become case 1 below case 2 below case 3 below * - * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: - * mprotect_fixup updates vm_flags & vm_page_prot on successful return. + * It is important for case 8 that the the vma NNNN overlapping the + * region AAAA is never going to extended over XXXX. Instead XXXX must + * be extended in region AAAA and NNNN must be removed. This way in + * all cases where vma_merge succeeds, the moment vma_adjust drops the + * rmap_locks, the properties of the merged vma will be already + * correct for the whole merged range. Some of those properties like + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must + * be correct for the whole merged range immediately after the + * rmap_locks are released. Otherwise if XXXX would be removed and + * NNNN would be extended over the XXXX range, remove_migration_ptes + * or other rmap walkers (if working on addresses beyond the "end" + * parameter) may establish ptes with the wrong permissions of NNNN + * instead of the right permissions of XXXX. */ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -962,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, else next = mm->mmap; area = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ + if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; + /* verify some invariant that must be enforced by the caller */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(addr >= end); + /* * Can it merge with the predecessor? */ @@ -985,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ - err = vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); } else /* cases 2, 5, 7 */ - err = vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); if (err) return NULL; khugepaged_enter_vma_merge(prev, vm_flags); @@ -1005,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ - err = vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); - else /* cases 3, 8 */ - err = vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); + err = __vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL, next); + else { /* cases 3, 8 */ + err = __vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + /* + * In case 3 area is already equal to next and + * this is a noop, but in case 8 "area" has + * been removed and next was expanded over it. + */ + area = next; + } if (err) return NULL; khugepaged_enter_vma_merge(area, vm_flags); @@ -1381,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * to the private version (using protection_map[] without the * VM_SHARED bit). */ -int vma_wants_writenotify(struct vm_area_struct *vma) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { vm_flags_t vm_flags = vma->vm_flags; const struct vm_operations_struct *vm_ops = vma->vm_ops; @@ -1396,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* The open routine did something to the protections that pgprot_modify * won't preserve? */ - if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; /* Do we need to track softdirty? */ @@ -3063,6 +3216,14 @@ out: return ERR_PTR(ret); } +bool vma_is_special_mapping(const struct vm_area_struct *vma, + const struct vm_special_mapping *sm) +{ + return vma->vm_private_data == sm && + (vma->vm_ops == &special_mapping_vmops || + vma->vm_ops == &legacy_special_mapping_vmops); +} + /* * Called with mm->mmap_sem held for writing. * Insert a new vma covering the given region, with the given flags. diff --git a/mm/mprotect.c b/mm/mprotect.c index a4830f0325fe..bcdbe62f3e6d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,11 +23,13 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/perf_event.h> +#include <linux/pkeys.h> #include <linux/ksm.h> #include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" @@ -304,6 +306,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; + VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); goto success; } @@ -327,7 +330,7 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma); + dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, @@ -352,8 +355,11 @@ fail: return error; } -SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, - unsigned long, prot) +/* + * pkey==-1 when doing a legacy mprotect() + */ +static int do_mprotect_pkey(unsigned long start, size_t len, + unsigned long prot, int pkey) { unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; @@ -382,6 +388,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if (down_write_killable(¤t->mm->mmap_sem)) return -EINTR; + /* + * If userspace did not allocate the pkey, do not let + * them use it here. + */ + error = -EINVAL; + if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) + goto out; + vma = find_vma(current->mm, start); error = -ENOMEM; if (!vma) @@ -408,8 +422,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, prev = vma; for (nstart = start ; ; ) { + unsigned long mask_off_old_flags; unsigned long newflags; - int pkey = arch_override_mprotect_pkey(vma, prot, -1); + int new_vma_pkey; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ @@ -417,8 +432,17 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, if (rier && (vma->vm_flags & VM_MAYEXEC)) prot |= PROT_EXEC; - newflags = calc_vm_prot_bits(prot, pkey); - newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + /* + * Each mprotect() call explicitly passes r/w/x permissions. + * If a permission is not passed to mprotect(), it must be + * cleared from the VMA. + */ + mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | + ARCH_VM_PKEY_FLAGS; + + new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); + newflags = calc_vm_prot_bits(prot, new_vma_pkey); + newflags |= (vma->vm_flags & ~mask_off_old_flags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { @@ -454,3 +478,60 @@ out: up_write(¤t->mm->mmap_sem); return error; } + +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) +{ + return do_mprotect_pkey(start, len, prot, -1); +} + +SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, + unsigned long, prot, int, pkey) +{ + return do_mprotect_pkey(start, len, prot, pkey); +} + +SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) +{ + int pkey; + int ret; + + /* No flags supported yet. */ + if (flags) + return -EINVAL; + /* check for unsupported init values */ + if (init_val & ~PKEY_ACCESS_MASK) + return -EINVAL; + + down_write(¤t->mm->mmap_sem); + pkey = mm_pkey_alloc(current->mm); + + ret = -ENOSPC; + if (pkey == -1) + goto out; + + ret = arch_set_user_pkey_access(current, pkey, init_val); + if (ret) { + mm_pkey_free(current->mm, pkey); + goto out; + } + ret = pkey; +out: + up_write(¤t->mm->mmap_sem); + return ret; +} + +SYSCALL_DEFINE1(pkey_free, int, pkey) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + ret = mm_pkey_free(current->mm, pkey); + up_write(¤t->mm->mmap_sem); + + /* + * We could provie warnings or errors if any VMA still + * has the pkey set here. + */ + return ret; +} diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd05a70f44b9..ba609b684d7a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -11,18 +11,21 @@ #include <linux/init.h> #include <linux/pfn.h> #include <linux/slab.h> -#include <linux/bootmem.h> #include <linux/export.h> #include <linux/kmemleak.h> #include <linux/range.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/bug.h> #include <asm/io.h> -#include <asm/processor.h> #include "internal.h" +#ifndef CONFIG_HAVE_MEMBLOCK +#error CONFIG_HAVE_MEMBLOCK not defined +#endif + #ifndef CONFIG_NEED_MULTIPLE_NODES struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); @@ -134,6 +137,11 @@ static unsigned long __init free_low_memory_core_early(void) for_each_reserved_mem_region(i, &start, &end) reserve_bootmem_region(start, end); + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); @@ -191,11 +199,6 @@ unsigned long __init free_all_bootmem(void) reset_all_zones_managed_pages(); - /* - * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesn't have RAM installed - * low ram will be on Node1 - */ pages = free_low_memory_core_early(); totalram_pages += pages; @@ -395,9 +398,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return __alloc_bootmem_node(pgdat, size, align, goal); } -#ifndef ARCH_LOW_ADDRESS_LIMIT -#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL -#endif /** * __alloc_bootmem_low - allocate low boot memory diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d53a9aa00977..ec9f11d4f094 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -132,6 +132,11 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -181,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags) || + test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return 0; @@ -213,12 +218,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, return points > 0 ? points : 1; } +enum oom_constraint { + CONSTRAINT_NONE, + CONSTRAINT_CPUSET, + CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, +}; + /* * Determine the type of allocation constraint. */ -#ifdef CONFIG_NUMA -static enum oom_constraint constrained_alloc(struct oom_control *oc, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc) { struct zone *zone; struct zoneref *z; @@ -226,8 +236,16 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, bool cpuset_limited = false; int nid; + if (is_memcg_oom(oc)) { + oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + return CONSTRAINT_MEMCG; + } + /* Default to all available memory */ - *totalpages = totalram_pages + total_swap_pages; + oc->totalpages = totalram_pages + total_swap_pages; + + if (!IS_ENABLED(CONFIG_NUMA)) + return CONSTRAINT_NONE; if (!oc->zonelist) return CONSTRAINT_NONE; @@ -246,9 +264,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, */ if (oc->nodemask && !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -259,98 +277,84 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, cpuset_limited = true; if (cpuset_limited) { - *totalpages = total_swap_pages; + oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - *totalpages += node_spanned_pages(nid); + oc->totalpages += node_spanned_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; } -#else -static enum oom_constraint constrained_alloc(struct oom_control *oc, - unsigned long *totalpages) -{ - *totalpages = totalram_pages + total_swap_pages; - return CONSTRAINT_NONE; -} -#endif -enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, - struct task_struct *task) +static int oom_evaluate_task(struct task_struct *task, void *arg) { + struct oom_control *oc = arg; + unsigned long points; + if (oom_unkillable_task(task, NULL, oc->nodemask)) - return OOM_SCAN_CONTINUE; + goto next; /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless - * the task has MMF_OOM_REAPED because chances that it would release + * the task has MMF_OOM_SKIP because chances that it would release * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - struct task_struct *p = find_lock_task_mm(task); - enum oom_scan_t ret = OOM_SCAN_ABORT; - - if (p) { - if (test_bit(MMF_OOM_REAPED, &p->mm->flags)) - ret = OOM_SCAN_CONTINUE; - task_unlock(p); - } - - return ret; + if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { + if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + goto next; + goto abort; } /* * If task is allocating a lot of memory and has been marked to be * killed first if it triggers an oom, then select it. */ - if (oom_task_origin(task)) - return OOM_SCAN_SELECT; + if (oom_task_origin(task)) { + points = ULONG_MAX; + goto select; + } - return OOM_SCAN_OK; + points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); + if (!points || points < oc->chosen_points) + goto next; + + /* Prefer thread group leaders for display purposes */ + if (points == oc->chosen_points && thread_group_leader(oc->chosen)) + goto next; +select: + if (oc->chosen) + put_task_struct(oc->chosen); + get_task_struct(task); + oc->chosen = task; + oc->chosen_points = points; +next: + return 0; +abort: + if (oc->chosen) + put_task_struct(oc->chosen); + oc->chosen = (void *)-1UL; + return 1; } /* - * Simple selection loop. We chose the process with the highest - * number of 'points'. Returns -1 on scan abort. + * Simple selection loop. We choose the process with the highest number of + * 'points'. In case scan was aborted, oc->chosen is set to -1. */ -static struct task_struct *select_bad_process(struct oom_control *oc, - unsigned int *ppoints, unsigned long totalpages) +static void select_bad_process(struct oom_control *oc) { - struct task_struct *p; - struct task_struct *chosen = NULL; - unsigned long chosen_points = 0; - - rcu_read_lock(); - for_each_process(p) { - unsigned int points; - - switch (oom_scan_process_thread(oc, p)) { - case OOM_SCAN_SELECT: - chosen = p; - chosen_points = ULONG_MAX; - /* fall through */ - case OOM_SCAN_CONTINUE: - continue; - case OOM_SCAN_ABORT: - rcu_read_unlock(); - return (struct task_struct *)(-1UL); - case OOM_SCAN_OK: - break; - }; - points = oom_badness(p, NULL, oc->nodemask, totalpages); - if (!points || points < chosen_points) - continue; + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); + else { + struct task_struct *p; - chosen = p; - chosen_points = points; + rcu_read_lock(); + for_each_process(p) + if (oom_evaluate_task(p, oc)) + break; + rcu_read_unlock(); } - if (chosen) - get_task_struct(chosen); - rcu_read_unlock(); - *ppoints = chosen_points * 1000 / totalpages; - return chosen; + oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages; } /** @@ -399,9 +403,14 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, + nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed; + + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(nm), oc->order, current->signal->oom_score_adj); + if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) + pr_warn("COMPACTION is disabled!!!\n"); cpuset_print_current_mems_allowed(); dump_stack(); @@ -419,7 +428,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -bool oom_killer_disabled __read_mostly; +static bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -452,12 +461,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task(struct task_struct *tsk) +static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct mm_struct *mm = NULL; - struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; @@ -465,7 +472,7 @@ static bool __oom_reap_task(struct task_struct *tsk) /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task exit_mm + * __oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -478,22 +485,9 @@ static bool __oom_reap_task(struct task_struct *tsk) */ mutex_lock(&oom_lock); - /* - * Make sure we find the associated mm_struct even when the particular - * thread has already terminated and cleared its mm. - * We might have race with exit path so consider our work done if there - * is no mm. - */ - p = find_lock_task_mm(tsk); - if (!p) - goto unlock_oom; - mm = p->mm; - atomic_inc(&mm->mm_count); - task_unlock(p); - if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto mm_drop; + goto unlock_oom; } /* @@ -503,9 +497,17 @@ static bool __oom_reap_task(struct task_struct *tsk) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); - goto mm_drop; + goto unlock_oom; } + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + tlb_gather_mmu(&tlb, mm, 0, -1); for (vma = mm->mmap ; vma; vma = vma->vm_next) { if (is_vm_hugetlb_page(vma)) @@ -541,18 +543,11 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * This task can be safely ignored because we cannot do much more - * to release its memory. - */ - set_bit(MMF_OOM_REAPED, &mm->flags); - /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. */ mmput_async(mm); -mm_drop: - mmdrop(mm); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -562,44 +557,28 @@ unlock_oom: static void oom_reap_task(struct task_struct *tsk) { int attempts = 0; + struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts > MAX_OOM_REAP_RETRIES) { - struct task_struct *p; + if (attempts <= MAX_OOM_REAP_RETRIES) + goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - /* - * If we've already tried to reap this task in the past and - * failed it probably doesn't make much sense to try yet again - * so hide the mm from the oom killer so that it can move on - * to another task with a different mm struct. - */ - p = find_lock_task_mm(tsk); - if (p) { - if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) { - pr_info("oom_reaper: giving up pid:%d (%s)\n", - task_pid_nr(tsk), tsk->comm); - set_bit(MMF_OOM_REAPED, &p->mm->flags); - } - task_unlock(p); - } + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); - debug_show_all_locks(); - } +done: + tsk->oom_reaper_list = NULL; /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore or it is not a good candidate - * for the oom victim right now because it cannot release its memory - * itself nor by the oom reaper. + * Hide this mm from OOM killer because it has been either reaped or + * somebody can't call up_write(mmap_sem). */ - tsk->oom_reaper_list = NULL; - exit_oom_victim(tsk); + set_bit(MMF_OOM_SKIP, &mm->flags); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); @@ -607,8 +586,6 @@ static void oom_reap_task(struct task_struct *tsk) static int oom_reaper(void *unused) { - set_freezable(); - while (true) { struct task_struct *tsk = NULL; @@ -627,7 +604,7 @@ static int oom_reaper(void *unused) return 0; } -void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct task_struct *tsk) { if (!oom_reaper_th) return; @@ -656,7 +633,11 @@ static int __init oom_init(void) return 0; } subsys_initcall(oom_init) -#endif +#else +static inline void wake_oom_reaper(struct task_struct *tsk) +{ +} +#endif /* CONFIG_MMU */ /** * mark_oom_victim - mark the given task as OOM victim @@ -664,14 +645,23 @@ subsys_initcall(oom_init) * * Has to be called with oom_lock held and never after * oom has been disabled already. + * + * tsk->mm has to be non NULL and caller has to guarantee it is stable (either + * under task_lock or operate on the current). */ -void mark_oom_victim(struct task_struct *tsk) +static void mark_oom_victim(struct task_struct *tsk) { + struct mm_struct *mm = tsk->mm; + WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_inc(&tsk->signal->oom_victims); + + /* oom_mm is bound to the signal struct life time. */ + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + atomic_inc(&tsk->signal->oom_mm->mm_count); + /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -685,21 +675,29 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(struct task_struct *tsk) +void exit_oom_victim(void) { - if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) - return; - atomic_dec(&tsk->signal->oom_victims); + clear_thread_flag(TIF_MEMDIE); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); } /** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + +/** * oom_killer_disable - disable OOM killer + * @timeout: maximum timeout to wait for oom victims in jiffies * * Forces all page allocations to fail rather than trigger OOM killer. - * Will block and wait until all OOM victims are killed. + * Will block and wait until all OOM victims are killed or the given + * timeout expires. * * The function cannot be called when there are runnable user tasks because * the userspace would see unexpected allocation failures as a result. Any @@ -708,8 +706,10 @@ void exit_oom_victim(struct task_struct *tsk) * Returns true if successful and false if the OOM killer cannot be * disabled. */ -bool oom_killer_disable(void) +bool oom_killer_disable(signed long timeout) { + signed long ret; + /* * Make sure to not race with an ongoing OOM killer. Check that the * current is not killed (possibly due to sharing the victim's memory). @@ -719,19 +719,16 @@ bool oom_killer_disable(void) oom_killer_disabled = true; mutex_unlock(&oom_lock); - wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + ret = wait_event_interruptible_timeout(oom_victims_wait, + !atomic_read(&oom_victims), timeout); + if (ret <= 0) { + oom_killer_enable(); + return false; + } return true; } -/** - * oom_killer_enable - enable OOM killer - */ -void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} - static inline bool __task_will_free_mem(struct task_struct *task) { struct signal_struct *sig = task->signal; @@ -760,7 +757,7 @@ static inline bool __task_will_free_mem(struct task_struct *task) * Caller has to make sure that task->mm is stable (hold task_lock or * it operates on the current). */ -bool task_will_free_mem(struct task_struct *task) +static bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; @@ -781,15 +778,16 @@ bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_REAPED, &mm->flags)) + if (test_bit(MMF_OOM_SKIP, &mm->flags)) return false; if (atomic_read(&mm->mm_users) <= 1) return true; /* - * This is really pessimistic but we do not have any reliable way - * to check that external processes share with our mm + * Make sure that all tasks which share the mm with the given tasks + * are dying as well to make sure that a) nobody pins its mm and + * b) the task is also reapable by the oom reaper. */ rcu_read_lock(); for_each_process(p) { @@ -806,14 +804,10 @@ bool task_will_free_mem(struct task_struct *task) return ret; } -/* - * Must be called while holding a reference to p, which will be released upon - * returning. - */ -void oom_kill_process(struct oom_control *oc, struct task_struct *p, - unsigned int points, unsigned long totalpages, - const char *message) +static void oom_kill_process(struct oom_control *oc, const char *message) { + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; struct task_struct *victim = p; struct task_struct *child; struct task_struct *t; @@ -860,7 +854,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, * oom_badness() returns 0 if the thread is unkillable */ child_points = oom_badness(child, - oc->memcg, oc->nodemask, totalpages); + oc->memcg, oc->nodemask, oc->totalpages); if (child_points > victim_points) { put_task_struct(victim); victim = child; @@ -913,20 +907,20 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) { - /* - * We cannot use oom_reaper for the mm shared by this - * process because it wouldn't get killed and so the - * memory might be still used. Hide the mm from the oom - * killer to guarantee OOM forward progress. - */ + if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); continue; } + /* + * No use_mm() user needs to read from the userspace so we are + * ok to reap it. + */ + if (unlikely(p->flags & PF_KTHREAD)) + continue; do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); @@ -942,7 +936,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) +static void check_panic_on_oom(struct oom_control *oc, + enum oom_constraint constraint) { if (likely(!sysctl_panic_on_oom)) return; @@ -988,19 +983,18 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); */ bool out_of_memory(struct oom_control *oc) { - struct task_struct *p; - unsigned long totalpages; unsigned long freed = 0; - unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; if (oom_killer_disabled) return false; - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return true; + if (!is_memcg_oom(oc)) { + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return true; + } /* * If current has a pending SIGKILL or is exiting, then automatically @@ -1024,37 +1018,38 @@ bool out_of_memory(struct oom_control *oc) /* * Check if there were limitations on the allocation (only relevant for - * NUMA) that may require different handling. + * NUMA and memcg) that may require different handling. */ - constraint = constrained_alloc(oc, &totalpages); + constraint = constrained_alloc(oc); if (constraint != CONSTRAINT_MEMORY_POLICY) oc->nodemask = NULL; check_panic_on_oom(oc, constraint); - if (sysctl_oom_kill_allocating_task && current->mm && - !oom_unkillable_task(current, NULL, oc->nodemask) && + if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && + current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(oc, current, 0, totalpages, - "Out of memory (oom_kill_allocating_task)"); + oc->chosen = current; + oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); return true; } - p = select_bad_process(oc, &points, totalpages); + select_bad_process(oc); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p && !is_sysrq_oom(oc)) { + if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { dump_header(oc, NULL); panic("Out of memory and no killable processes...\n"); } - if (p && p != (void *)-1UL) { - oom_kill_process(oc, p, points, totalpages, "Out of memory"); + if (oc->chosen && oc->chosen != (void *)-1UL) { + oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : + "Memory cgroup out of memory"); /* * Give the killed process a good chance to exit before trying * to allocate memory again. */ schedule_timeout_killable(1); } - return true; + return !!oc->chosen; } /* @@ -1077,16 +1072,6 @@ void pagefault_out_of_memory(void) if (!mutex_trylock(&oom_lock)) return; - - if (!out_of_memory(&oc)) { - /* - * There shouldn't be any user tasks runnable while the - * OOM killer is disabled, so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. - */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); - } - + out_of_memory(&oc); mutex_unlock(&oom_lock); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f4cd7d8005c9..439cc63ad903 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1965,36 +1965,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } -void throttle_vm_writeout(gfp_t gfp_mask) -{ - unsigned long background_thresh; - unsigned long dirty_thresh; - - for ( ; ; ) { - global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); - - /* - * Boost the allowable dirty threshold a bit for page - * allocators so they don't get DoS'ed by heavy writers - */ - dirty_thresh += dirty_thresh / 10; /* wheeee... */ - - if (global_node_page_state(NR_UNSTABLE_NFS) + - global_node_page_state(NR_WRITEBACK) <= dirty_thresh) - break; - congestion_wait(BLK_RW_ASYNC, HZ/10); - - /* - * The caller might hold locks which can prevent IO completion - * or progress in the filesystem. So we cannot just sit here - * waiting for IO to complete. - */ - if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) - break; - } -} - /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ @@ -2080,26 +2050,12 @@ void writeback_set_ratelimit(void) ratelimit_pages = 16; } -static int -ratelimit_handler(struct notifier_block *self, unsigned long action, - void *hcpu) +static int page_writeback_cpu_online(unsigned int cpu) { - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - case CPU_DEAD: - writeback_set_ratelimit(); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } + writeback_set_ratelimit(); + return 0; } -static struct notifier_block ratelimit_nb = { - .notifier_call = ratelimit_handler, - .next = NULL, -}; - /* * Called early on to tune the page writeback dirty limits. * @@ -2122,8 +2078,10 @@ void __init page_writeback_init(void) { BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); - writeback_set_ratelimit(); - register_cpu_notifier(&ratelimit_nb); + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online", + page_writeback_cpu_online, NULL); + cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, + page_writeback_cpu_online); } /** @@ -2758,7 +2716,7 @@ int test_clear_page_writeback(struct page *page) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; @@ -2801,7 +2759,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) int ret; lock_page_memcg(page); - if (mapping) { + if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..ca423cc20b59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -607,6 +607,9 @@ static bool need_debug_guardpage(void) if (!debug_pagealloc_enabled()) return false; + if (!debug_guardpage_minorder()) + return false; + return true; } @@ -615,6 +618,9 @@ static void init_debug_guardpage(void) if (!debug_pagealloc_enabled()) return; + if (!debug_guardpage_minorder()) + return; + _debug_guardpage_enabled = true; } @@ -635,19 +641,22 @@ static int __init debug_guardpage_minorder_setup(char *buf) pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } -__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); +early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); -static inline void set_page_guard(struct zone *zone, struct page *page, +static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { struct page_ext *page_ext; if (!debug_guardpage_enabled()) - return; + return false; + + if (order >= debug_guardpage_minorder()) + return false; page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) - return; + return false; __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); @@ -655,6 +664,8 @@ static inline void set_page_guard(struct zone *zone, struct page *page, set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); + + return true; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -676,9 +687,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -struct page_ext_operations debug_guardpage_ops = { NULL, }; -static inline void set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} +struct page_ext_operations debug_guardpage_ops; +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif @@ -1393,15 +1404,18 @@ static void __init deferred_free_range(struct page *page, return; /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == MAX_ORDER_NR_PAGES && - (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, MAX_ORDER-1); + __free_pages_boot_core(page, pageblock_order); return; } - for (i = 0; i < nr_pages; i++, page++) + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, 0); + } } /* Completion tracking for deferred_init_memmap() threads */ @@ -1469,9 +1483,9 @@ static int __init deferred_init_memmap(void *data) /* * Ensure pfn_valid is checked every - * MAX_ORDER_NR_PAGES for memory holes + * pageblock_nr_pages for memory holes */ - if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; goto free_range; @@ -1484,7 +1498,7 @@ static int __init deferred_init_memmap(void *data) } /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { page++; } else { nr_pages += nr_to_free; @@ -1520,6 +1534,9 @@ free_range: free_base_page = NULL; free_base_pfn = nr_to_free = 0; } + /* Free the last block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, nr_to_free); first_init_pfn = max(end_pfn, first_init_pfn); } @@ -1616,18 +1633,15 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && - debug_guardpage_enabled() && - high < debug_guardpage_minorder()) { - /* - * Mark as guard pages (or page), that will allow to - * merge back to allocator when buddy will be freed. - * Corresponding page table entries will not be touched, - * pages will stay not present in virtual address space - */ - set_page_guard(zone, &page[size], high, migratetype); + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + if (set_page_guard(zone, &page[size], high, migratetype)) continue; - } + list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -2489,9 +2503,14 @@ int __isolate_free_page(struct page *page, unsigned int order) mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt)) { - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + /* + * Obey watermarks as if the page was being allocated. We can + * emulate a high-order watermark check with a raised order-0 + * watermark, because we already know our high-order page + * exists. + */ + watermark = min_wmark_pages(zone) + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -2960,9 +2979,11 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; + struct va_format vaf; + va_list args; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -2980,22 +3001,16 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - if (fmt) { - struct va_format vaf; - va_list args; + pr_warn("%s: ", current->comm); - va_start(args, fmt); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_cont("%pV", &vaf); + va_end(args); - vaf.fmt = fmt; - vaf.va = &args; + pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); - pr_warn("%pV", &vaf); - - va_end(args); - } - - pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", - current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -3137,6 +3152,65 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + int min_priority; + + if (!order) + return false; + + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) + goto check_priority; + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. + */ + if (compaction_withdrawn(compact_result)) + return compaction_zonelist_suitable(ac, order, alloc_flags); + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (*compaction_retries <= max_retries) + return true; + + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. + */ +check_priority: + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { + (*compact_priority)--; + *compaction_retries = 0; + return true; + } + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3147,13 +3221,11 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -#endif /* CONFIG_COMPACTION */ - static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, - int compaction_retries) + int *compaction_retries) { struct zone *zone; struct zoneref *z; @@ -3175,6 +3247,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } +#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int @@ -3325,16 +3398,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, int no_progress_loops) + bool did_some_progress, int *no_progress_loops) { struct zone *zone; struct zoneref *z; /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + + /* * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (no_progress_loops > MAX_RECLAIM_RETRIES) + if (*no_progress_loops > MAX_RECLAIM_RETRIES) return false; /* @@ -3349,7 +3432,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, unsigned long reclaimable; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP(no_progress_loops * available, + available -= DIV_ROUND_UP((*no_progress_loops) * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3410,6 +3493,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries = 0; int no_progress_loops = 0; + unsigned long alloc_start = jiffies; + unsigned int stall_timeout = 10 * HZ; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3554,9 +3639,6 @@ retry: if (page) goto got_pg; - if (order && compaction_made_progress(compact_result)) - compaction_retries++; - /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) goto nopage; @@ -3568,18 +3650,16 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* - * Costly allocations might have made a progress but this doesn't mean - * their order will become available due to high fragmentation so - * always increment the no progress counter for them - */ - if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) - no_progress_loops = 0; - else - no_progress_loops++; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, + "page alloction stalls for %ums, order:%u\n", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; + } if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, no_progress_loops)) + did_some_progress > 0, &no_progress_loops)) goto retry; /* @@ -3591,7 +3671,7 @@ retry: if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, - compaction_retries)) + &compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ @@ -3606,7 +3686,8 @@ retry: } nopage: - warn_alloc_failed(gfp_mask, order, NULL); + warn_alloc(gfp_mask, + "page allocation failure: order:%u", order); got_pg: return page; } @@ -4555,7 +4636,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); @@ -4571,7 +4652,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) int j; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[1]; + zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; @@ -4592,7 +4673,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) struct zone *z; struct zonelist *zonelist; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { @@ -4727,7 +4808,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[0]; + zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; j = build_zonelists_node(pgdat, zonelist, 0); /* @@ -5000,15 +5081,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * If not mirrored_kernelcore and ZONE_MOVABLE exists, range - * from zone_movable_pfn[nid] to end of each node should be - * ZONE_MOVABLE not ZONE_NORMAL. skip it. - */ - if (!mirrored_kernelcore && zone_movable_pfn[nid]) - if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) - continue; - - /* * Check given memblock attribute by firmware which can affect * kernel memory layout. If zone==ZONE_MOVABLE but memory is * mirrored, it's an overlapped memmap init. skip it. @@ -5451,6 +5523,12 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5554,28 +5632,23 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages * and vice versa. */ - if (zone_movable_pfn[nid]) { - if (mirrored_kernelcore) { - unsigned long start_pfn, end_pfn; - struct memblock_region *r; - - for_each_memblock(memory, r) { - start_pfn = clamp(memblock_region_memory_base_pfn(r), - zone_start_pfn, zone_end_pfn); - end_pfn = clamp(memblock_region_memory_end_pfn(r), - zone_start_pfn, zone_end_pfn); - - if (zone_type == ZONE_MOVABLE && - memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - - if (zone_type == ZONE_NORMAL && - !memblock_is_mirror(r)) - nr_absent += end_pfn - start_pfn; - } - } else { - if (zone_type == ZONE_NORMAL) - nr_absent += node_end_pfn - zone_movable_pfn[nid]; + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; } } @@ -6929,6 +7002,17 @@ static int __init set_hashdist(char *str) __setup("hashdist=", set_hashdist); #endif +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). + */ +static unsigned long __init arch_reserved_kernel_pages(void) +{ + return 0; +} +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -6953,6 +7037,7 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; + numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) diff --git a/mm/page_ext.c b/mm/page_ext.c index 44a4c029c8e7..121dcffc4ec1 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -42,6 +42,11 @@ * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * + * When need callback returns true, page_ext checks if there is a request for + * extra memory through size in struct page_ext_operations. If it is non-zero, + * extra space is allocated for each page_ext entry and offset is returned to + * user through offset in struct page_ext_operations. + * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime @@ -66,18 +71,24 @@ static struct page_ext_operations *page_ext_ops[] = { }; static unsigned long total_usage; +static unsigned long extra_mem; static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); + bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) - return true; + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + page_ext_ops[i]->offset = sizeof(struct page_ext) + + extra_mem; + extra_mem += page_ext_ops[i]->size; + need = true; + } } - return false; + return need; } static void __init invoke_init_callbacks(void) @@ -91,6 +102,16 @@ static void __init invoke_init_callbacks(void) } } +static unsigned long get_entry_size(void) +{ + return sizeof(struct page_ext) + extra_mem; +} + +static inline struct page_ext *get_entry(void *base, unsigned long index) +{ + return base + get_entry_size() * index; +} + #if !defined(CONFIG_SPARSEMEM) @@ -102,7 +123,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); - unsigned long offset; + unsigned long index; struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; @@ -119,9 +140,9 @@ struct page_ext *lookup_page_ext(struct page *page) if (unlikely(!base)) return NULL; #endif - offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); - return base + offset; + return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) @@ -143,7 +164,7 @@ static int __init alloc_node_page_ext(int nid) !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; - table_size = sizeof(struct page_ext) * nr_pages; + table_size = get_entry_size() * nr_pages; base = memblock_virt_alloc_try_nid_nopanic( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), @@ -196,7 +217,7 @@ struct page_ext *lookup_page_ext(struct page *page) if (!section->page_ext) return NULL; #endif - return section->page_ext + pfn; + return get_entry(section->page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -229,7 +250,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) if (section->page_ext) return 0; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* @@ -249,7 +270,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid) * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; - section->page_ext = base - pfn; + section->page_ext = (void *)base - get_entry_size() * pfn; total_usage += table_size; return 0; } @@ -262,7 +283,7 @@ static void free_page_ext(void *addr) struct page *page = virt_to_page(addr); size_t table_size; - table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); free_pages_exact(addr, table_size); @@ -277,7 +298,7 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = ms->page_ext + pfn; + base = get_entry(ms->page_ext, pfn); free_page_ext(base); ms->page_ext = NULL; } diff --git a/mm/page_io.c b/mm/page_io.c index eafe5ddc2b54..a2651f58c86a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,7 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -338,7 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); - BUG_ON(!PageSwapCache(page)); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -388,7 +388,8 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; - BUG_ON(!PageSwapCache(page)); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 064b7fb6e0b5..a5594bfcc5ed 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -55,7 +55,7 @@ static int set_migratetype_isolate(struct page *page, ret = 0; /* - * immobile means "not-on-lru" paes. If immobile is larger than + * immobile means "not-on-lru" pages. If immobile is larger than * removable-by-driver pages reported by notifier, we'll fail. */ diff --git a/mm/page_owner.c b/mm/page_owner.c index ec6dc1886f71..60634dc53a88 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -8,6 +8,7 @@ #include <linux/jump_label.h> #include <linux/migrate.h> #include <linux/stackdepot.h> +#include <linux/seq_file.h> #include "internal.h" @@ -17,6 +18,13 @@ */ #define PAGE_OWNER_STACK_DEPTH (16) +struct page_owner { + unsigned int order; + gfp_t gfp_mask; + int last_migrate_reason; + depot_stack_handle_t handle; +}; + static bool page_owner_disabled = true; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -85,10 +93,16 @@ static void init_page_owner(void) } struct page_ext_operations page_owner_ops = { + .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, }; +static inline struct page_owner *get_page_owner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_owner_ops.offset; +} + void __reset_page_owner(struct page *page, unsigned int order) { int i; @@ -155,14 +169,16 @@ noinline void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->handle = save_stack(gfp_mask); - page_ext->order = order; - page_ext->gfp_mask = gfp_mask; - page_ext->last_migrate_reason = -1; + page_owner = get_page_owner(page_ext); + page_owner->handle = save_stack(gfp_mask); + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -170,21 +186,26 @@ noinline void __set_page_owner(struct page *page, unsigned int order, void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; + if (unlikely(!page_ext)) return; - page_ext->last_migrate_reason = reason; + page_owner = get_page_owner(page_ext); + page_owner->last_migrate_reason = reason; } void __split_page_owner(struct page *page, unsigned int order) { int i; struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; if (unlikely(!page_ext)) return; - page_ext->order = 0; + page_owner = get_page_owner(page_ext); + page_owner->order = 0; for (i = 1; i < (1 << order); i++) __copy_page_owner(page, page + i); } @@ -193,14 +214,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_owner *old_page_owner, *new_page_owner; if (unlikely(!old_ext || !new_ext)) return; - new_ext->order = old_ext->order; - new_ext->gfp_mask = old_ext->gfp_mask; - new_ext->last_migrate_reason = old_ext->last_migrate_reason; - new_ext->handle = old_ext->handle; + old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + new_page_owner->order = old_page_owner->order; + new_page_owner->gfp_mask = old_page_owner->gfp_mask; + new_page_owner->last_migrate_reason = + old_page_owner->last_migrate_reason; + new_page_owner->handle = old_page_owner->handle; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -214,9 +239,88 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) __set_bit(PAGE_EXT_OWNER, &new_ext->flags); } +void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + struct page_owner *page_owner; + unsigned long pfn = zone->zone_start_pfn, block_end_pfn; + unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + + page = pfn_to_page(pfn); + pageblock_mt = get_pageblock_migratetype(page); + + for (; pfn < block_end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + + if (PageBuddy(page)) { + pfn += (1UL << page_order(page)) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + continue; + + page_owner = get_page_owner(page_ext); + page_mt = gfpflags_to_migratetype( + page_owner->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + break; + } + pfn += (1UL << page_owner->order) - 1; + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext, + struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { int ret; @@ -236,15 +340,15 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, ret = snprintf(kbuf, count, "Page allocated via order %u, mask %#x(%pGg)\n", - page_ext->order, page_ext->gfp_mask, - &page_ext->gfp_mask); + page_owner->order, page_owner->gfp_mask, + &page_owner->gfp_mask); if (ret >= count) goto err; /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + page_mt = gfpflags_to_migratetype(page_owner->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, @@ -261,10 +365,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - if (page_ext->last_migrate_reason != -1) { + if (page_owner->last_migrate_reason != -1) { ret += snprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); if (ret >= count) goto err; } @@ -287,6 +391,7 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + struct page_owner *page_owner; unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, @@ -302,7 +407,9 @@ void __dump_page_owner(struct page *page) pr_alert("There is not page extension available.\n"); return; } - gfp_mask = page_ext->gfp_mask; + + page_owner = get_page_owner(page_ext); + gfp_mask = page_owner->gfp_mask; mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { @@ -310,7 +417,7 @@ void __dump_page_owner(struct page *page) return; } - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) { pr_alert("page_owner info is not active (free page?)\n"); return; @@ -318,12 +425,12 @@ void __dump_page_owner(struct page *page) depot_fetch_stack(handle, &trace); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); - if (page_ext->last_migrate_reason != -1) + if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", - migrate_reason_names[page_ext->last_migrate_reason]); + migrate_reason_names[page_owner->last_migrate_reason]); } static ssize_t @@ -332,6 +439,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) unsigned long pfn; struct page *page; struct page_ext *page_ext; + struct page_owner *page_owner; depot_stack_handle_t handle; if (!static_branch_unlikely(&page_owner_inited)) @@ -381,11 +489,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + page_owner = get_page_owner(page_ext); + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. */ - handle = READ_ONCE(page_ext->handle); + handle = READ_ONCE(page_owner->handle); if (!handle) continue; @@ -393,7 +503,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) *ppos = (pfn - min_low_pfn) + 1; return print_page_owner(buf, count, pfn, page, - page_ext, handle); + page_owner, handle); } return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 6505c8b99f31..8596217b5e26 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -960,7 +960,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -2311,119 +2311,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct address_space *mapping = in->f_mapping; - struct inode *inode = mapping->host; - unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct page *page; - pgoff_t index, end_index; - loff_t isize, left; - int error, page_nr; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &page_cache_pipe_buf_ops, - .spd_release = spd_release_page, - }; - - isize = i_size_read(inode); - if (unlikely(*ppos >= isize)) - return 0; - - left = isize - *ppos; - if (unlikely(left < len)) - len = left; - - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; - - index = *ppos >> PAGE_SHIFT; - loff = *ppos & ~PAGE_MASK; - req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = min(req_pages, spd.nr_pages_max); - - spd.nr_pages = find_get_pages_contig(mapping, index, - nr_pages, spd.pages); - index += spd.nr_pages; - error = 0; - - while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE); - if (error) - break; - unlock_page(page); - spd.pages[spd.nr_pages++] = page; - index++; - } - - index = *ppos >> PAGE_SHIFT; - nr_pages = spd.nr_pages; - spd.nr_pages = 0; - - for (page_nr = 0; page_nr < nr_pages; page_nr++) { - unsigned int this_len; - - if (!len) - break; - - this_len = min_t(unsigned long, len, PAGE_SIZE - loff); - page = spd.pages[page_nr]; - - if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, SGP_CACHE); - if (error) - break; - unlock_page(page); - put_page(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } - - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) - break; - - if (end_index == index) { - unsigned int plen; - - plen = ((isize - 1) & ~PAGE_MASK) + 1; - if (plen <= loff) - break; - - this_len = min(this_len, plen - loff); - len = this_len; - } - - spd.partial[page_nr].offset = loff; - spd.partial[page_nr].len = this_len; - len -= this_len; - loff = 0; - spd.nr_pages++; - index++; - } - - while (page_nr < nr_pages) - put_page(spd.pages[page_nr++]); - - if (spd.nr_pages) - error = splice_to_pipe(pipe, &spd); - - splice_shrink_spd(&spd); - - if (error > 0) { - *ppos += error; - file_accessed(in); - } - return error; -} - /* * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. */ @@ -3780,7 +3667,7 @@ static const struct file_operations shmem_file_operations = { .read_iter = shmem_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, - .splice_read = shmem_file_splice_read, + .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif @@ -4063,7 +3950,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); /* common code */ -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; diff --git a/mm/slab.c b/mm/slab.c index b67271024135..090fb26b3a39 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -886,6 +886,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) return 0; } +#if (defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)) || defined(CONFIG_SMP) /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node @@ -908,6 +909,7 @@ static int init_cache_node_node(int node) return 0; } +#endif static int setup_kmem_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp, bool force_change) @@ -975,6 +977,8 @@ fail: return ret; } +#ifdef CONFIG_SMP + static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -1075,65 +1079,54 @@ bad: return -ENOMEM; } -static int cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +int slab_prepare_cpu(unsigned int cpu) { - long cpu = (long)hcpu; - int err = 0; + int err; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - mutex_lock(&slab_mutex); - err = cpuup_prepare(cpu); - mutex_unlock(&slab_mutex); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - start_cpu_timer(cpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Shutdown cache reaper. Note that the slab_mutex is - * held so that if cache_reap() is invoked it cannot do - * anything expensive but will only modify reap_work - * and reschedule the timer. - */ - cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); - /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(slab_reap_work, cpu).work.func = NULL; - break; - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Even if all the cpus of a node are down, we don't free the - * kmem_cache_node of any cache. This to avoid a race between - * cpu_down, and a kmalloc allocation from another cpu for - * memory from the node of the cpu going down. The node - * structure is usually allocated from kmem_cache_create() and - * gets destroyed at kmem_cache_destroy(). - */ - /* fall through */ + mutex_lock(&slab_mutex); + err = cpuup_prepare(cpu); + mutex_unlock(&slab_mutex); + return err; +} + +/* + * This is called for a failed online attempt and for a successful + * offline. + * + * Even if all the cpus of a node are down, we don't free the + * kmem_list3 of any cache. This to avoid a race between cpu_down, and + * a kmalloc allocation from another cpu for memory from the node of + * the cpu going down. The list3 structure is usually allocated from + * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). + */ +int slab_dead_cpu(unsigned int cpu) +{ + mutex_lock(&slab_mutex); + cpuup_canceled(cpu); + mutex_unlock(&slab_mutex); + return 0; +} #endif - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - mutex_lock(&slab_mutex); - cpuup_canceled(cpu); - mutex_unlock(&slab_mutex); - break; - } - return notifier_from_errno(err); + +static int slab_online_cpu(unsigned int cpu) +{ + start_cpu_timer(cpu); + return 0; } -static struct notifier_block cpucache_notifier = { - &cpuup_callback, NULL, 0 -}; +static int slab_offline_cpu(unsigned int cpu) +{ + /* + * Shutdown cache reaper. Note that the slab_mutex is held so + * that if cache_reap() is invoked it cannot do anything + * expensive but will only modify reap_work and reschedule the + * timer. + */ + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(slab_reap_work, cpu).work.func = NULL; + return 0; +} #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) /* @@ -1336,12 +1329,6 @@ void __init kmem_cache_init_late(void) /* Done! */ slab_state = FULL; - /* - * Register a cpu startup notifier callback that initializes - * cpu_cache_get for all new cpus - */ - register_cpu_notifier(&cpucache_notifier); - #ifdef CONFIG_NUMA /* * Register a memory hotplug callback that initializes and frees @@ -1358,13 +1345,14 @@ void __init kmem_cache_init_late(void) static int __init cpucache_init(void) { - int cpu; + int ret; /* * Register the timers that return unneeded pages to the page allocator */ - for_each_online_cpu(cpu) - start_cpu_timer(cpu); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", + slab_online_cpu, slab_offline_cpu); + WARN_ON(ret < 0); /* Done! */ slab_state = FULL; diff --git a/mm/slub.c b/mm/slub.c index 9adae58462f8..2b3e740609e9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -194,10 +194,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define __OBJECT_POISON 0x80000000UL /* Poison object */ #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ -#ifdef CONFIG_SMP -static struct notifier_block slab_notifier; -#endif - /* * Tracking user of a slab. */ @@ -2305,6 +2301,25 @@ static void flush_all(struct kmem_cache *s) } /* + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. + */ +static int slub_cpu_dead(unsigned int cpu) +{ + struct kmem_cache *s; + unsigned long flags; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + } + mutex_unlock(&slab_mutex); + return 0; +} + +/* * Check if the objects in a per cpu structure fit numa * locality expectations. */ @@ -4144,9 +4159,8 @@ void __init kmem_cache_init(void) /* Setup random freelists for each cache */ init_freelist_randomization(); -#ifdef CONFIG_SMP - register_cpu_notifier(&slab_notifier); -#endif + cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + slub_cpu_dead); pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), @@ -4210,43 +4224,6 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return err; } -#ifdef CONFIG_SMP -/* - * Use the cpu notifier to insure that the cpu slabs are flushed when - * necessary. - */ -static int slab_cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct kmem_cache *s; - unsigned long flags; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); - __flush_cpu_slab(s, cpu); - local_irq_restore(flags); - } - mutex_unlock(&slab_mutex); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block slab_notifier = { - .notifier_call = slab_cpuup_callback -}; - -#endif - void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; diff --git a/mm/swap.c b/mm/swap.c index 75c63bb2a1da..4dcf852e1e6d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold) locked_pgdat = NULL; } - if (is_huge_zero_page(page)) { - put_huge_zero_page(); + if (is_huge_zero_page(page)) continue; - } page = compound_head(page); if (!put_page_testzero(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index c8310a37be3a..35d7e0ee1c77 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,8 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, + /* swap cache doesn't use writeback related tags */ + .flags = 1 << AS_NO_WRITEBACK_TAGS, } }; @@ -92,7 +94,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); error = radix_tree_insert(&address_space->page_tree, - entry.val, page); + swp_offset(entry), page); if (likely(!error)) { address_space->nrpages++; __inc_node_page_state(page, NR_FILE_PAGES); @@ -143,7 +145,7 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, page_private(page)); + radix_tree_delete(&address_space->page_tree, swp_offset(entry)); set_page_private(page, 0); ClearPageSwapCache(page); address_space->nrpages--; @@ -252,9 +254,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - if (is_huge_zero_page(page)) - put_huge_zero_page(); - else + if (!is_huge_zero_page(page)) put_page(page); } @@ -283,7 +283,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page) { INC_CACHE_INFO(find_success); @@ -310,7 +310,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, entry.val); + found_page = find_get_page(swapper_space, swp_offset(entry)); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2657accc6e2b..2210de290b54 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -105,7 +105,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(swap_address_space(entry), entry.val); + page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (!page) return 0; /* @@ -257,6 +257,53 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_list_empty(struct swap_cluster_list *list) +{ + return cluster_is_null(&list->head); +} + +static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +{ + return cluster_next(&list->head); +} + +static void cluster_list_init(struct swap_cluster_list *list) +{ + cluster_set_null(&list->head); + cluster_set_null(&list->tail); +} + +static void cluster_list_add_tail(struct swap_cluster_list *list, + struct swap_cluster_info *ci, + unsigned int idx) +{ + if (cluster_list_empty(list)) { + cluster_set_next_flag(&list->head, idx, 0); + cluster_set_next_flag(&list->tail, idx, 0); + } else { + unsigned int tail = cluster_next(&list->tail); + + cluster_set_next(&ci[tail], idx); + cluster_set_next_flag(&list->tail, idx, 0); + } +} + +static unsigned int cluster_list_del_first(struct swap_cluster_list *list, + struct swap_cluster_info *ci) +{ + unsigned int idx; + + idx = cluster_next(&list->head); + if (cluster_next(&list->tail) == idx) { + cluster_set_null(&list->head); + cluster_set_null(&list->tail); + } else + cluster_set_next_flag(&list->head, + cluster_next(&ci[idx]), 0); + + return idx; +} + /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) @@ -270,17 +317,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); - if (cluster_is_null(&si->discard_cluster_head)) { - cluster_set_next_flag(&si->discard_cluster_head, - idx, 0); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } else { - unsigned int tail = cluster_next(&si->discard_cluster_tail); - cluster_set_next(&si->cluster_info[tail], idx); - cluster_set_next_flag(&si->discard_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx); schedule_work(&si->discard_work); } @@ -296,15 +333,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) info = si->cluster_info; - while (!cluster_is_null(&si->discard_cluster_head)) { - idx = cluster_next(&si->discard_cluster_head); - - cluster_set_next_flag(&si->discard_cluster_head, - cluster_next(&info[idx]), 0); - if (cluster_next(&si->discard_cluster_tail) == idx) { - cluster_set_null(&si->discard_cluster_head); - cluster_set_null(&si->discard_cluster_tail); - } + while (!cluster_list_empty(&si->discard_clusters)) { + idx = cluster_list_del_first(&si->discard_clusters, info); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, @@ -312,19 +342,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&si->free_cluster_head)) { - cluster_set_next_flag(&si->free_cluster_head, - idx, 0); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = cluster_next(&si->free_cluster_tail); - cluster_set_next(&info[tail], idx); - cluster_set_next_flag(&si->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&si->free_clusters, info, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); } @@ -353,13 +371,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); - cluster_set_next_flag(&p->free_cluster_head, - cluster_next(&cluster_info[idx]), 0); - if (cluster_next(&p->free_cluster_tail) == idx) { - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->free_cluster_head); - } + VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); + cluster_list_del_first(&p->free_clusters, cluster_info); cluster_set_count_flag(&cluster_info[idx], 0, 0); } @@ -398,14 +411,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, } cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } else { - unsigned int tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - cluster_set_next_flag(&p->free_cluster_tail, idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } } @@ -421,8 +427,8 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, bool conflict; offset /= SWAPFILE_CLUSTER; - conflict = !cluster_is_null(&si->free_cluster_head) && - offset != cluster_next(&si->free_cluster_head) && + conflict = !cluster_list_empty(&si->free_clusters) && + offset != cluster_list_first(&si->free_clusters) && cluster_is_free(&si->cluster_info[offset]); if (!conflict) @@ -447,11 +453,11 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); if (cluster_is_null(&cluster->index)) { - if (!cluster_is_null(&si->free_cluster_head)) { - cluster->index = si->free_cluster_head; + if (!cluster_list_empty(&si->free_clusters)) { + cluster->index = si->free_clusters.head; cluster->next = cluster_next(&cluster->index) * SWAPFILE_CLUSTER; - } else if (!cluster_is_null(&si->discard_cluster_head)) { + } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in * discarding, do discard now and reclaim them @@ -999,7 +1005,7 @@ int free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), - entry.val); + swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; @@ -2292,10 +2298,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = maxpages - 1; /* omit header page */ - cluster_set_null(&p->free_cluster_head); - cluster_set_null(&p->free_cluster_tail); - cluster_set_null(&p->discard_cluster_head); - cluster_set_null(&p->discard_cluster_tail); + cluster_list_init(&p->free_clusters); + cluster_list_init(&p->discard_clusters); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2341,19 +2345,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, for (i = 0; i < nr_clusters; i++) { if (!cluster_count(&cluster_info[idx])) { cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - if (cluster_is_null(&p->free_cluster_head)) { - cluster_set_next_flag(&p->free_cluster_head, - idx, 0); - cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } else { - unsigned int tail; - - tail = cluster_next(&p->free_cluster_tail); - cluster_set_next(&cluster_info[tail], idx); - cluster_set_next_flag(&p->free_cluster_tail, - idx, 0); - } + cluster_list_add_tail(&p->free_clusters, cluster_info, + idx); } idx++; if (idx == nr_clusters) diff --git a/mm/vmacache.c b/mm/vmacache.c index fd09dc9c6812..035fdeb35b43 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -87,11 +87,11 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; @@ -115,11 +115,11 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, { int i; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + if (!vmacache_valid(mm)) return NULL; - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 91f44e78c516..f2481cb4e6b2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1359,14 +1359,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) - align = 1ul << clamp_t(int, fls_long(size), - PAGE_SHIFT, IOREMAP_MAX_ORDER); - size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; + if (flags & VM_IOREMAP) + align = 1ul << clamp_t(int, get_count_order_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1601,7 +1601,6 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { - const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; @@ -1629,9 +1628,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_pages(alloc_mask, order); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_pages_node(node, alloc_mask, 0); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1648,8 +1647,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, - "vmalloc: allocation failure, allocated %ld of %ld bytes\n", + warn_alloc(gfp_mask, + "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; @@ -1710,9 +1709,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc_failed(gfp_mask, 0, - "vmalloc: allocation failure: %lu bytes\n", - real_size); + warn_alloc(gfp_mask, + "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 0fe8b7113868..744f926af442 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2418,8 +2418,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); - - throttle_vm_writeout(sc->gfp_mask); } /* Use reclaim/compaction for costly allocs or under memory pressure */ @@ -2480,7 +2478,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, * If we have not reclaimed enough pages for compaction and the * inactive lists are large enough, continue reclaiming */ - pages_for_compaction = (2UL << sc->order); + pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); @@ -2495,7 +2493,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { - case COMPACT_PARTIAL: + case COMPACT_SUCCESS: case COMPACT_CONTINUE: return false; default: @@ -2598,38 +2596,35 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } /* - * Returns true if compaction should go ahead for a high-order request, or - * the high-order allocation would succeed without compaction. + * Returns true if compaction should go ahead for a costly-order request, or + * the allocation would already succeed without compaction. Return false if we + * should reclaim first. */ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; - bool watermark_ok; + enum compact_result suitable; - /* - * Compaction takes time to run and there are potentially other - * callers using the pages just freed. Continue reclaiming until - * there is a buffer of free pages available to give compaction - * a reasonable chance of completing and allocating the page - */ - watermark = high_wmark_pages(zone) + (2UL << sc->order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); - - /* - * If compaction is deferred, reclaim up to a point where - * compaction will have a chance of success when re-enabled - */ - if (compaction_deferred(zone, sc->order)) - return watermark_ok; + suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); + if (suitable == COMPACT_SUCCESS) + /* Allocation should succeed already. Don't reclaim. */ + return true; + if (suitable == COMPACT_SKIPPED) + /* Compaction cannot yet proceed. Do reclaim. */ + return false; /* - * If compaction is not ready to start and allocation is not likely - * to succeed without it, then keep reclaiming. + * Compaction is already possible, but it takes time to run and there + * are potentially other callers using the pages just freed. So proceed + * with reclaim to make a buffer of free pages available to give + * compaction a reasonable chance of completing and allocating the page. + * Note that we won't actually reclaim the whole buffer in one attempt + * as the target watermark in should_continue_reclaim() is lower. But if + * we are already above the high+gap watermark, don't reclaim at all. */ - if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) - return false; + watermark = high_wmark_pages(zone) + compact_gap(sc->order); - return watermark_ok; + return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); } /* @@ -3041,7 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, */ nid = mem_cgroup_select_victim_node(memcg); - zonelist = NODE_DATA(nid)->node_zonelists; + zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, @@ -3169,7 +3164,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * excessive reclaim. Assume that a process requested a high-order * can direct reclaim/compact. */ - if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; diff --git a/mm/vmstat.c b/mm/vmstat.c index 89cec42d19ff..604f26a4f696 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1254,85 +1254,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) return 0; } -#ifdef CONFIG_PAGE_OWNER -static void pagetypeinfo_showmixedcount_print(struct seq_file *m, - pg_data_t *pgdat, - struct zone *zone) -{ - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; - unsigned long count[MIGRATE_TYPES] = { 0, }; - int pageblock_mt, page_mt; - int i; - - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - - /* - * Walk the zone in pageblock_nr_pages steps. If a page block spans - * a zone boundary, it will be double counted between zones. This does - * not matter as the mixed block count will still be correct - */ - for (; pfn < end_pfn; ) { - if (!pfn_valid(pfn)) { - pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); - continue; - } - - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - block_end_pfn = min(block_end_pfn, end_pfn); - - page = pfn_to_page(pfn); - pageblock_mt = get_pageblock_migratetype(page); - - for (; pfn < block_end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - - page = pfn_to_page(pfn); - - if (page_zone(page) != zone) - continue; - - if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; - continue; - } - - if (PageReserved(page)) - continue; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - continue; - - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; - - page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); - if (pageblock_mt != page_mt) { - if (is_migrate_cma(pageblock_mt)) - count[MIGRATE_MOVABLE]++; - else - count[pageblock_mt]++; - - pfn = block_end_pfn; - break; - } - pfn += (1UL << page_ext->order) - 1; - } - } - - /* Print counts */ - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (i = 0; i < MIGRATE_TYPES; i++) - seq_printf(m, "%12lu ", count[i]); - seq_putc(m, '\n'); -} -#endif /* CONFIG_PAGE_OWNER */ - /* * Print out the number of pageblocks for each migratetype that contain pages * of other types. This gives an indication of how well fallbacks are being @@ -1592,7 +1513,10 @@ static int vmstat_show(struct seq_file *m, void *arg) { unsigned long *l = arg; unsigned long off = l - (unsigned long *)m->private; - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + + seq_puts(m, vmstat_text[off]); + seq_put_decimal_ull(m, " ", *l); + seq_putc(m, '\n'); return 0; } @@ -1794,6 +1718,16 @@ static void __init start_shepherd_timer(void) round_jiffies_relative(sysctl_stat_interval)); } +static void __init init_cpu_node_state(void) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + node_set_state(cpu_to_node(cpu), N_CPU); + put_online_cpus(); +} + static void vmstat_cpu_dead(int node) { int cpu; @@ -1851,6 +1785,7 @@ static int __init setup_vmstat(void) #ifdef CONFIG_SMP cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); + init_cpu_node_state(); start_shepherd_timer(); cpu_notifier_register_done(); diff --git a/mm/workingset.c b/mm/workingset.c index 69551cfae97b..617475f529f4 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -418,21 +418,19 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - - BUG_ON(!node->count); - BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + BUG_ON(!workingset_node_shadows(node)); + BUG_ON(workingset_node_pages(node)); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); node->slots[i] = NULL; - BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + workingset_node_shadows_dec(node); BUG_ON(!mapping->nrexceptional); mapping->nrexceptional--; } } - BUG_ON(node->count); + BUG_ON(workingset_node_shadows(node)); inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); if (!__radix_tree_delete_node(&mapping->page_tree, node)) BUG(); |