diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 523 |
1 files changed, 350 insertions, 173 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81e18ceef579..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -204,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -264,17 +265,19 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; -static unsigned long __meminitdata nr_kernel_pages; -static unsigned long __meminitdata nr_all_pages; -static unsigned long __meminitdata dma_reserve; +static unsigned long nr_kernel_pages __meminitdata; +static unsigned long nr_all_pages __meminitdata; +static unsigned long dma_reserve __meminitdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; -static unsigned long __initdata required_movablecore; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static bool mirrored_kernelcore; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long required_kernelcore __initdata; +static unsigned long required_kernelcore_percent __initdata; +static unsigned long required_movablecore __initdata; +static unsigned long required_movablecore_percent __initdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -291,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - -/* - * Determine how many pages need to be initialized during early boot - * (non-deferred initialization). - * The value of first_deferred_pfn will be set later, once non-deferred pages - * are initialized, but for now set it ULONG_MAX. - */ -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ - phys_addr_t start_addr, end_addr; - unsigned long max_pgcnt; - unsigned long reserved; - - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_pgcnt = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - - /* - * Compensate the all the memblock reservations (e.g. crash kernel) - * from the initial estimation to make sure we will initialize enough - * memory to boot. - */ - start_addr = PFN_PHYS(pgdat->node_start_pfn); - end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); - reserved = memblock_reserved_memory_within(start_addr, end_addr); - max_pgcnt += PHYS_PFN(reserved); - - pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -} - /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -357,10 +326,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, return true; } #else -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ -} - static inline bool early_page_uninitialised(unsigned long pfn) { return false; @@ -1095,6 +1060,15 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ +static inline void prefetch_buddy(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + struct page *buddy = page + (buddy_pfn - pfn); + + prefetch(buddy); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1111,13 +1085,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int prefetch_nr = 0; bool isolated_pageblocks; - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); + struct page *page, *tmp; + LIST_HEAD(head); while (count) { - struct page *page; struct list_head *list; /* @@ -1139,26 +1112,48 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = count; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - - mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); + pcp->count--; if (bulkfree_pcp_prepare(page)) continue; - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add_tail(&page->lru, &head); + + /* + * We are going to put the page back to the global + * pool, prefetch its buddy to speed up later access + * under zone->lock. It is believed the overhead of + * an additional test and calculating buddy_pfn here + * can be offset by reduced memory latency later. To + * avoid excessive prefetching due to large count, only + * prefetch buddy for the first pcp->batch nr of pages. + */ + if (prefetch_nr++ < pcp->batch) + prefetch_buddy(page); } while (--count && --batch_free && !list_empty(list)); } + + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. + */ + list_for_each_entry_safe(page, tmp, &head, lru) { + int mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + } spin_unlock(&zone->lock); } @@ -1177,10 +1172,9 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid, bool zero) + unsigned long zone, int nid) { - if (zero) - mm_zero_struct_page(page); + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1194,12 +1188,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } -static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid, bool zero) -{ - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); -} - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -1218,7 +1206,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid, true); + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1502,7 +1490,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - cond_resched(); + touch_nmi_watchdog(); } else { nr_free++; } @@ -1531,11 +1519,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - cond_resched(); + touch_nmi_watchdog(); } else { page++; } - __init_single_page(page, pfn, zid, nid, true); + __init_single_page(page, pfn, zid, nid); nr_pages++; } return (nr_pages); @@ -1548,23 +1536,25 @@ static int __init deferred_init_memmap(void *data) int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long spfn, epfn; + unsigned long spfn, epfn, first_init_pfn, flags; phys_addr_t spa, epa; int zid; struct zone *zone; - unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); u64 i; + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + pgdat_resize_lock(pgdat, &flags); + first_init_pfn = pgdat->first_deferred_pfn; if (first_init_pfn == ULONG_MAX) { + pgdat_resize_unlock(pgdat, &flags); pgdat_init_report_one_done(); return 0; } - /* Bind memory initialisation thread to a local node if possible */ - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(current, cpumask); - /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); @@ -1594,6 +1584,7 @@ static int __init deferred_init_memmap(void *data) epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); deferred_free_pages(nid, zid, spfn, epfn); } + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1604,6 +1595,117 @@ static int __init deferred_init_memmap(void *data) pgdat_init_report_one_done(); return 0; } + +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * If this zone has deferred pages, try to grow it by initializing enough + * deferred pages to satisfy the allocation specified by order, rounded up to + * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments + * of SECTION_SIZE bytes by initializing struct pages in increments of + * PAGES_PER_SECTION * sizeof(struct page) bytes. + * + * Return true when zone was grown, otherwise return false. We return true even + * when we grow less than requested, to let the caller decide if there are + * enough pages to satisfy the allocation. + * + * Note: We use noinline because this function is needed only during boot, and + * it is called from a __ref function _deferred_grow_zone. This way we are + * making sure that it is not inlined into permanent text section. + */ +static noinline bool __init +deferred_grow_zone(struct zone *zone, unsigned int order) +{ + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages = 0; + unsigned long first_init_pfn, spfn, epfn, t, flags; + unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + phys_addr_t spa, epa; + u64 i; + + /* Only the last zone may have deferred pages */ + if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) + return false; + + pgdat_resize_lock(pgdat, &flags); + + /* + * If deferred pages have been initialized while we were waiting for + * the lock, return true, as the zone was grown. The caller will retry + * this zone. We won't return to this function since the caller also + * has this static branch. + */ + if (!static_branch_unlikely(&deferred_pages)) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + /* + * If someone grew this zone while we were waiting for spinlock, return + * true, as there might be enough pages already. + */ + if (first_deferred_pfn != pgdat->first_deferred_pfn) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); + + if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + pgdat_resize_unlock(pgdat, &flags); + return false; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + + while (spfn < epfn && nr_pages < nr_pages_needed) { + t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); + first_deferred_pfn = min(t, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, + first_deferred_pfn); + spfn = first_deferred_pfn; + } + + if (nr_pages >= nr_pages_needed) + break; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); + + if (first_deferred_pfn == epfn) + break; + } + pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat_resize_unlock(pgdat, &flags); + + return nr_pages > 0; +} + +/* + * deferred_grow_zone() is __init, but it is called from + * get_page_from_freelist() during early boot until deferred_pages permanently + * disables this call. This is why we have refdata wrapper to avoid warning, + * and to ensure that the function body gets unloaded. + */ +static bool __ref +_deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return deferred_grow_zone(zone, order); +} + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) @@ -1622,6 +1724,12 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ + static_branch_disable(&deferred_pages); + /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif @@ -1635,16 +1743,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. + */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -1906,7 +2036,9 @@ static int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && + pfn_valid(page_to_pfn(end_page)) && + page_zone(start_page) != page_zone(end_page)); #endif if (num_movable) @@ -2412,10 +2544,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) local_irq_save(flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) { + if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; - } local_irq_restore(flags); } #endif @@ -2437,10 +2567,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { + if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } local_irq_restore(flags); } @@ -2664,7 +2792,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); - pcp->count -= batch; } } @@ -2762,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3038,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead @@ -3070,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3093,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3108,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3199,6 +3311,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, ac_classzone_idx(ac), alloc_flags)) { int ret; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. + */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) @@ -3240,6 +3362,14 @@ try_this_zone: reserve_highatomic_pageblock(page, zone, order); return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif } } @@ -3590,7 +3720,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return false; /* this guy won't enter reclaim */ - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + if (current->flags & PF_MEMALLOC) return false; /* We're only interested __GFP_FS allocations for now */ @@ -3679,16 +3809,18 @@ retry: return page; } -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; + enum zone_type high_zoneidx = ac->high_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, order, ac->high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -3724,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -3967,7 +4095,7 @@ retry_cpuset: goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try @@ -4025,7 +4153,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) @@ -4194,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4606,6 +4731,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; @@ -5328,6 +5460,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; + struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; #endif @@ -5350,17 +5483,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) @@ -5389,6 +5513,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #endif not_early: + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMMAP_HOTPLUG) + SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -5405,15 +5534,8 @@ not_early: * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone, nid, - context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); - } else { - __init_single_pfn(pfn, zone, nid, - context != MEMMAP_HOTPLUG); } } } @@ -6082,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6109,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6160,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. + */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -6195,10 +6340,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = alloc_remap(pgdat->node_id, size); - if (!map) - map = memblock_virt_alloc_node_nopanic(size, - pgdat->node_id); + map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + offset; } pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6247,7 +6389,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, alloc_node_mem_map(pgdat); - reset_deferred_meminit(pgdat); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +#endif free_area_init_core(pgdat); } @@ -6477,7 +6627,18 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* - * If movablecore=nn[KMG] was specified, calculate what size of + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. + */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -6717,18 +6878,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zero_resv_unavail(); } -static int __init cmdline_parse_core(char *p, unsigned long *core) +static int __init cmdline_parse_core(char *p, unsigned long *core, + unsigned long *percent) { unsigned long long coremem; + char *endptr; + if (!p) return -EINVAL; - coremem = memparse(p, &p); - *core = coremem >> PAGE_SHIFT; + /* Value may be a percentage of total memory, otherwise bytes */ + coremem = simple_strtoull(p, &endptr, 0); + if (*endptr == '%') { + /* Paranoid check for percent values greater than 100 */ + WARN_ON(coremem > 100); - /* Paranoid check that UL is enough for the coremem value */ - WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *percent = coremem; + } else { + coremem = memparse(p, &p); + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *core = coremem >> PAGE_SHIFT; + *percent = 0UL; + } return 0; } @@ -6744,7 +6917,8 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore); + return cmdline_parse_core(p, &required_kernelcore, + &required_kernelcore_percent); } /* @@ -6753,7 +6927,8 @@ static int __init cmdline_parse_kernelcore(char *p) */ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore); + return cmdline_parse_core(p, &required_movablecore, + &required_movablecore_percent); } early_param("kernelcore", cmdline_parse_kernelcore); @@ -6977,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7597,7 +7774,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CMA); + NULL, 0, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -7617,11 +7794,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned, however it's the caller's responsibility to guarantee that - * we are the only thread that changes migrate type of pageblocks the - * pages fall in. + * aligned. The PFN range must belong to a single zone. * - * The PFN range must belong to a single zone. + * The first thing this routine does is attempt to MIGRATE_ISOLATE all + * pageblocks in the range. Once isolated, the pageblocks should not + * be modified by others. * * Returns zero on success or negative error code. On success all * pages which PFN is in [start, end) are allocated for the caller and @@ -7774,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalulated. |