summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-01-21 19:05:45 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-01-21 19:05:45 -0800
commitdf32e43a54d04eda35d2859beaf90e3864d53288 (patch)
tree7a61cf658b2949bd426285eb9902be7758ced1ba /mm
parentfbd918a2026d0464ce9c23f57b7de4bcfccdc2e6 (diff)
parent78d5506e82b21a1a1de68c24182db2c2fe521422 (diff)
downloadlinux-stable-df32e43a54d04eda35d2859beaf90e3864d53288.tar.gz
linux-stable-df32e43a54d04eda35d2859beaf90e3864d53288.tar.bz2
linux-stable-df32e43a54d04eda35d2859beaf90e3864d53288.zip
Merge branch 'akpm' (incoming from Andrew)
Merge first patch-bomb from Andrew Morton: - a couple of misc things - inotify/fsnotify work from Jan - ocfs2 updates (partial) - about half of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits) mm/migrate: remove unused function, fail_migrate_page() mm/migrate: remove putback_lru_pages, fix comment on putback_movable_pages mm/migrate: correct failure handling if !hugepage_migration_support() mm/migrate: add comment about permanent failure path mm, page_alloc: warn for non-blockable __GFP_NOFAIL allocation failure mm: compaction: reset scanner positions immediately when they meet mm: compaction: do not mark unmovable pageblocks as skipped in async compaction mm: compaction: detect when scanners meet in isolate_freepages mm: compaction: reset cached scanner pfn's before reading them mm: compaction: encapsulate defer reset logic mm: compaction: trace compaction begin and end memcg, oom: lock mem_cgroup_print_oom_info sched: add tracepoints related to NUMA task migration mm: numa: do not automatically migrate KSM pages mm: numa: trace tasks that fail migration due to rate limiting mm: numa: limit scope of lock for NUMA migrate rate limiting mm: numa: make NUMA-migrate related functions static lib/show_mem.c: show num_poisoned_pages when oom mm/hwpoison: add '#' to hwpoison_inject mm/memblock: use WARN_ONCE when MAX_NUMNODES passed as input parameter ...
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c61
-rw-r--r--mm/hugetlb.c46
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h4
-rw-r--r--mm/ksm.c121
-rw-r--r--mm/memblock.c387
-rw-r--r--mm/memcontrol.c17
-rw-r--r--mm/memory-failure.c10
-rw-r--r--mm/memory.c16
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/migrate.c89
-rw-r--r--mm/mlock.c18
-rw-r--r--mm/mmap.c46
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/nobootmem.c10
-rw-r--r--mm/nommu.c1
-rw-r--r--mm/oom_kill.c51
-rw-r--r--mm/page_alloc.c89
-rw-r--r--mm/page_cgroup.c5
-rw-r--r--mm/percpu.c38
-rw-r--r--mm/rmap.c580
-rw-r--r--mm/sparse-vmemmap.c6
-rw-r--r--mm/sparse.c27
-rw-r--r--mm/swap.c278
-rw-r--r--mm/util.c36
-rw-r--r--mm/vmalloc.c20
26 files changed, 1145 insertions, 820 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..3a91a2ea3d34 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ bool skipped_async_unsuitable = false;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
cc->finished_update_migrate = true;
+ skipped_async_unsuitable = true;
goto next_pageblock;
}
@@ -627,8 +629,13 @@ next_pageblock:
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (low_pfn == end_pfn)
+ /*
+ * Update the pageblock-skip information and cached scanner pfn,
+ * if the whole pageblock was scanned without isolating any page.
+ * This is not done when pageblock was skipped due to being unsuitable
+ * for async compaction, so that eventual sync compaction can try.
+ */
+ if (low_pfn == end_pfn && !skipped_async_unsuitable)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone,
* is the end of the pageblock the migration scanner is using.
*/
pfn = cc->free_pfn;
- low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+ low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/*
* Take care that if the migration scanner is at the end of the zone
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone,
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone,
/* split_free_page does not map the pages */
map_pages(freelist);
- cc->free_pfn = high_pfn;
+ /*
+ * If we crossed the migrate scanner, we want to keep it that way
+ * so that compact_finished() may detect this
+ */
+ if (pfn < low_pfn)
+ cc->free_pfn = max(pfn, zone->zone_start_pfn);
+ else
+ cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
}
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone,
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
+ /* Let the next compaction start anew. */
+ zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
}
/*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
+ /*
* Setup to move all movable pages to the end of the zone. Used cached
* information on where the scanners should start but check that it
* is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
- /*
- * Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
- */
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
- __reset_isolation_suitable(zone);
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
migrate_prep_local();
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- if (err == -ENOMEM) {
+ /*
+ * migrate_pages() may return -ENOMEM when scanners meet
+ * and we want compact_finished() to detect it
+ */
+ if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
@@ -1015,6 +1039,8 @@ out:
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
+ trace_mm_compaction_end(ret);
+
return ret;
}
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
compact_zone(zone, cc);
if (cc->order > 0) {
- int ok = zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0);
- if (ok && cc->order >= zone->compact_order_failed)
- zone->compact_order_failed = cc->order + 1;
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
/* Currently async compaction is never deferred. */
- else if (!ok && cc->sync)
+ else if (cc->sync)
defer_compaction(zone, cc->order);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..04306b9de90d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
*/
int PageHuge(struct page *page)
{
- compound_page_dtor *dtor;
-
if (!PageCompound(page))
return 0;
page = compound_head(page);
- dtor = get_compound_page_dtor(page);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page) == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
*/
int PageHeadHuge(struct page *page_head)
{
- compound_page_dtor *dtor;
-
if (!PageHead(page_head))
return 0;
- dtor = get_compound_page_dtor(page_head);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page_head) == free_huge_page;
}
-EXPORT_SYMBOL_GPL(PageHeadHuge);
pgoff_t __basepage_index(struct page *page)
{
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- huge_page_size(h), huge_page_size(h), 0);
-
+ addr = memblock_virt_alloc_try_nid_nopanic(
+ huge_page_size(h), huge_page_size(h),
+ 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
#ifdef CONFIG_HIGHMEM
page = pfn_to_page(m->phys >> PAGE_SHIFT);
- free_bootmem_late((unsigned long)m,
- sizeof(struct huge_bootmem_page));
+ memblock_free_late(__pa(m),
+ sizeof(struct huge_bootmem_page));
#else
page = virt_to_page(m);
#endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte)
- goto nomem;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
- return 0;
-nomem:
- return -ENOMEM;
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+ return ret;
}
static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ get_page_foll(pages[i]);
}
if (vmas)
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
return 0;
inject:
- printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+ pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
}
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..a346ba120e42 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page,
* page_cache_get_speculative()) on tail pages.
*/
VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- VM_BUG_ON(page_mapcount(page) < 0);
if (get_page_head)
atomic_inc(&page->first_page->_count);
- atomic_inc(&page->_mapcount);
+ get_huge_page_tail(page);
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc95..3df141e5f3e0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- unsigned int mapcount = page_mapcount(page);
- int referenced = 0;
+ int ret = SWAP_AGAIN;
int search_new_forks = 0;
VM_BUG_ON(!PageKsm(page));
+
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
VM_BUG_ON(!PageLocked(page));
stable_node = page_stable_node(page);
if (!stable_node)
- return 0;
+ return ret;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
-
- referenced += page_referenced_one(page, vma,
- rmap_item->address, &mapcount, vm_flags);
- if (!search_new_forks || !mapcount)
- break;
- }
- anon_vma_unlock_read(anon_vma);
- if (!mapcount)
- goto out;
- }
- if (!search_new_forks++)
- goto again;
-out:
- return referenced;
-}
-
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return SWAP_FAIL;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = try_to_unmap_one(page, vma,
- rmap_item->address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page)) {
+ ret = rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg);
+ if (ret != SWAP_AGAIN) {
anon_vma_unlock_read(anon_vma);
goto out;
}
- }
- anon_vma_unlock_read(anon_vma);
- }
- if (!search_new_forks++)
- goto again;
-out:
- return ret;
-}
-
-#ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return ret;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
- continue;
-
- ret = rmap_one(page, vma, rmap_item->address, arg);
- if (ret != SWAP_AGAIN) {
+ if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
goto out;
}
@@ -2047,6 +1953,7 @@ out:
return ret;
}
+#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..1c2ef2c7edab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
#include <linux/memblock.h>
#include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
};
int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
/**
* memblock_find_in_range_node - find free area in given range and node
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* RETURNS:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
- phys_addr_t end, phys_addr_t size,
- phys_addr_t align, int nid)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
int ret;
phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(start, end, size, align,
- MAX_NUMNODES);
+ return memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+ type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
if (memblock.reserved.regions == memblock_reserved_init_regions)
return 0;
+ /*
+ * Don't allow nobootmem allocator to free reserved memory regions
+ * array if
+ * - CONFIG_DEBUG_FS is enabled;
+ * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
+ * - reserved memory regions array have been resized during boot.
+ * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved"
+ * will show garbage instead of state of memory reservations.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_FS) &&
+ !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ return 0;
+
*addr = __pa(memblock.reserved.regions);
return PAGE_ALIGN(sizeof(struct memblock_region) *
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
- memblock_get_region_node(next)) {
+ memblock_get_region_node(next) ||
+ this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @base: base address of the new region
* @size: size of the new region
* @nid: node id of the new region
+ * @flags: flags of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
* @type must already have extra room to accomodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
- phys_addr_t size, int nid)
+ phys_addr_t size,
+ int nid, unsigned long flags)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+ rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
+ * @flags: flags of the new region
*
* Add new memblock region [@base,@base+@size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* 0 on success, -errno on failure.
*/
static int __init_memblock memblock_add_region(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size, int nid)
+ phys_addr_t base, phys_addr_t size,
+ int nid, unsigned long flags)
{
bool insert = false;
phys_addr_t obase = base;
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
+ type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
@@ -505,7 +532,8 @@ repeat:
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
- rbase - base, nid);
+ rbase - base, nid,
+ flags);
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -515,7 +543,8 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base, nid);
+ memblock_insert_region(type, i, base, end - base,
+ nid, flags);
}
/*
@@ -537,12 +566,13 @@ repeat:
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
- return memblock_add_region(&memblock.memory, base, size, nid);
+ return memblock_add_region(&memblock.memory, base, size, nid, 0);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
+ return memblock_add_region(&memblock.memory, base, size,
+ MAX_NUMNODES, 0);
}
/**
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= base - rbase;
type->total_size -= base - rbase;
memblock_insert_region(type, i, rbase, base - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else if (rend > end) {
/*
* @rgn intersects from above. Split and redo the
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= end - rbase;
type->total_size -= end - rbase;
memblock_insert_region(type, i--, rbase, end - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
+ (unsigned long long)base + size - 1,
(void *)_RET_IP_);
return __memblock_remove(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_reserve_region(phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ unsigned long flags)
{
struct memblock_type *_rgn = &memblock.reserved;
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+ memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
- (void *)_RET_IP_);
+ (unsigned long long)base + size - 1,
+ flags, (void *)_RET_IP_);
- return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
+ return memblock_add_region(_rgn, base, size, nid, flags);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and mark it with flag
+ * MEMBLOCK_HOTPLUG.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+
+ memblock_merge_regions(type);
+ return 0;
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and clear flag
+ * MEMBLOCK_HOTPLUG for the isolated regions.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_clear_region_flags(&type->regions[i],
+ MEMBLOCK_HOTPLUG);
+
+ memblock_merge_regions(type);
+ return 0;
}
/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
int mi = *idx & 0xffffffff;
int ri = *idx >> 32;
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
for ( ; mi < mem->cnt; mi++) {
struct memblock_region *m = &mem->regions[mi];
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
continue;
/* scan areas before each reservation for intersection */
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
/**
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
* Reverse of __next_free_mem_range().
+ *
+ * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
+ * be able to hot-remove hotpluggable memory used by the kernel. So this
+ * function skip hotpluggable regions if needed when allocating memory for the
+ * kernel.
*/
void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
phys_addr_t *out_start,
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
int mi = *idx & 0xffffffff;
int ri = *idx >> 32;
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
if (*idx == (u64)ULLONG_MAX) {
mi = mem->cnt - 1;
ri = rsv->cnt;
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
+ continue;
+
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
/* scan areas before each reservation for intersection */
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
* @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
- int nid)
+ struct memblock_type *type, int nid)
{
- struct memblock_type *type = &memblock.memory;
int start_rgn, end_rgn;
int i, ret;
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
{
phys_addr_t found;
- if (WARN_ON(!align))
- align = __alignof__(long long);
+ if (!align)
+ align = SMP_CACHE_BYTES;
/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);
- found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+ found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
if (found && !memblock_reserve(found, size))
return found;
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
+/**
+ * memblock_virt_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Also, allocation may fall back
+ * to any node in the system if the specified node can not
+ * hold the requested memory.
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
+ *
+ * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ *
+ * The phys address of allocated boot memory block is converted to virtual and
+ * allocated memory is reset to 0.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_virt_alloc_internal(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t alloc;
+ void *ptr;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ /*
+ * Detect any accidental use of these APIs after slab is ready, as at
+ * this moment memblock may be deinitialized already and its
+ * internal data may be destroyed (after execution of free_all_bootmem)
+ */
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+
+ if (!align)
+ align = SMP_CACHE_BYTES;
+
+ /* align @size to avoid excessive fragmentation on reserved array */
+ size = round_up(size, align);
+
+again:
+ alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
+ nid);
+ if (alloc)
+ goto done;
+
+ if (nid != NUMA_NO_NODE) {
+ alloc = memblock_find_in_range_node(size, align, min_addr,
+ max_addr, NUMA_NO_NODE);
+ if (alloc)
+ goto done;
+ }
+
+ if (min_addr) {
+ min_addr = 0;
+ goto again;
+ } else {
+ goto error;
+ }
+
+done:
+ memblock_reserve(alloc, size);
+ ptr = phys_to_virt(alloc);
+ memset(ptr, 0, size);
+
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks. This is because many of these blocks
+ * are only referred via the physical address which is not
+ * looked up by kmemleak.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+
+ return ptr;
+
+error:
+ return NULL;
+}
+
+/**
+ * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
+ * additional debug information (including caller info), if enabled.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ return memblock_virt_alloc_internal(size, align, min_addr,
+ max_addr, nid);
+}
+
+/**
+ * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
+ * which provides debug information (including caller info), if enabled,
+ * and panics if the request can not be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ ptr = memblock_virt_alloc_internal(size, align,
+ min_addr, max_addr, nid);
+ if (ptr)
+ return ptr;
+
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr);
+ return NULL;
+}
+
+/**
+ * __memblock_free_early - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
+{
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ __memblock_remove(&memblock.reserved, base, size);
+}
+
+/*
+ * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * @addr: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are released directly
+ * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+ u64 cursor, end;
+
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ cursor = PFN_UP(base);
+ end = PFN_DOWN(base + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
/*
* Remaining API functions
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
unsigned long long base, size;
+ unsigned long flags;
int i;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
+ flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
- name, i, base, base + size - 1, size, nid_buf);
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
+ name, i, base, base + size - 1, size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7caff36180cd..67dd2a881433 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
/*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
+ * protects memcg_name and makes sure that parallel ooms do not
+ * interleave
*/
+ static DEFINE_SPINLOCK(oom_info_lock);
+ struct cgroup *task_cgrp;
+ struct cgroup *mem_cgrp;
static char memcg_name[PATH_MAX];
int ret;
struct mem_cgroup *iter;
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!p)
return;
+ spin_lock(&oom_info_lock);
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1771,6 +1772,7 @@ done:
pr_cont("\n");
}
+ spin_unlock(&oom_info_lock);
}
/*
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex);
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) ==
+ KMEM_ACCOUNTED_MASK;
}
/*
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg)
* But when we create a new cache, we can call this as well if its parent
* is kmem-limited. That will have to hold set_limit_mutex as well.
*/
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+static int memcg_update_cache_sizes(struct mem_cgroup *memcg)
{
int num, ret;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fabe55046c1d..b25ed321e667 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty cache page page
+ * Dirty pagecache page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags)
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- putback_lru_pages(&pagelist);
+ if (!list_empty(&pagelist)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..86487dfa5e59 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/dma-debug.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
+ debug_dma_assert_idle(src);
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+
+static struct kmem_cache *page_ptl_cachep;
+
+void __init ptlock_cache_init(void)
+{
+ page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+ SLAB_PANIC, NULL);
+}
+
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
- ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
page->ptl = ptl;
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page)
void ptlock_free(struct page *page)
{
- kfree(page->ptl);
+ kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502db..cc2ab37220b7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
}
/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic() */
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
unsigned long start_pfn, unsigned long num_pages)
{
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p)
* the kernel away from hotpluggable memory.
*/
memblock_set_bottom_up(true);
+ movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");
#endif
diff --git a/mm/migrate.c b/mm/migrate.c
index 9194375b2307..a8025befc323 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -72,28 +72,12 @@ int migrate_prep_local(void)
}
/*
- * Add isolated pages on the list back to the LRU under page lock
- * to avoid leaking evictable pages back onto unevictable list.
- */
-void putback_lru_pages(struct list_head *l)
-{
- struct page *page;
- struct page *page2;
-
- list_for_each_entry_safe(page, page2, l, lru) {
- list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
-}
-
-/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
*
- * This function shall be used instead of putback_lru_pages(),
- * whenever the isolated pageset has been built by isolate_migratepages_range()
+ * This function shall be used whenever the isolated pageset has been
+ * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
+ * and isolate_huge_page().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -199,7 +183,12 @@ out:
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- rmap_walk(new, remove_migration_pte, old);
+ struct rmap_walk_control rwc = {
+ .rmap_one = remove_migration_pte,
+ .arg = old,
+ };
+
+ rmap_walk(new, &rwc);
}
/*
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Migration functions
***********************************************************/
-/* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
-{
- return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
/*
* Common logic to directly migrate a single page suitable for
* pages that do not use PagePrivate/PagePrivate2.
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
{
int rc = 0;
int *result = NULL;
- struct page *new_hpage = get_new_page(hpage, private, &result);
+ struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
/*
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* tables or check whether the hugepage is pmd-based or not before
* kicking migration.
*/
- if (!hugepage_migration_support(page_hstate(hpage)))
+ if (!hugepage_migration_support(page_hstate(hpage))) {
+ putback_active_hugepage(hpage);
return -ENOSYS;
+ }
+ new_hpage = get_new_page(hpage, private, &result);
if (!new_hpage)
return -ENOMEM;
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
nr_succeeded++;
break;
default:
- /* Permanent failure */
+ /*
+ * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * unlike -EAGAIN case, the failed page is
+ * removed from migration page list and not
+ * retried in the next outer loop.
+ */
nr_failed++;
break;
}
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node)
}
/* Returns true if the node is migrate rate-limited after the update */
-bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+ unsigned long nr_pages)
{
- bool rate_limited = false;
-
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
- spin_lock(&pgdat->numabalancing_migrate_lock);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ spin_lock(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
}
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
- rate_limited = true;
- else
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- spin_unlock(&pgdat->numabalancing_migrate_lock);
-
- return rate_limited;
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+ trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+ nr_pages);
+ return true;
+ }
+
+ /*
+ * This is an unlocked non-atomic update so errors are possible.
+ * The consequences are failing to migrate when we potentiall should
+ * have which is not severe enough to warrant locking. If it is ever
+ * a problem, it can be converted to a per-cpu counter.
+ */
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ return false;
}
-int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
if (nr_remaining) {
- putback_lru_pages(&migratepages);
+ if (!list_empty(&migratepages)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
isolated = 0;
} else
count_vm_numa_event(NUMA_PAGE_MIGRATE);
diff --git a/mm/mlock.c b/mm/mlock.c
index 192e6eebe4f2..10819ed4df3e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
- locked = len >> PAGE_SHIFT;
- locked += current->mm->locked_vm;
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
+ locked = len >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+
+ locked += current->mm->locked_vm;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
+
up_write(&current->mm->mmap_sem);
if (!error)
error = __mm_populate(start, len, 0);
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
+
+ down_write(&current->mm->mmap_sem);
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
+
return ret;
}
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
+ down_write(&current->mm->mmap_sem);
+
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1e..a0e7153a79e6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (error & ~PAGE_MASK)
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bb53a6591aea..7332c1785744 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
+#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
ptent = *pte;
page = vm_normal_page(vma, addr, oldpte);
- if (page) {
+ if (page && !PageKsm(page)) {
if (!pte_numa(oldpte)) {
ptent = pte_mknuma(ptent);
set_pte_at(mm, addr, pte, ptent);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d374655..19121ceb8874 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(goal, limit, size, align, nid);
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid);
if (!addr)
return NULL;
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void)
phys_addr_t start, end, size;
u64 i;
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);
/* free range that is used for reserved array if we allocate it */
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void)
reset_all_zones_managed_pages();
/*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
restart:
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
if (ptr)
return ptr;
@@ -299,7 +299,7 @@ again:
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
goal, limit);
if (ptr)
return ptr;
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9c..8740213b1647 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a6163..054ff47c4478 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
{
- struct task_struct *start = tsk;
+ struct task_struct *tsk;
+ bool ret = false;
- do {
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- if (mempolicy_nodemask_intersects(tsk, mask))
- return true;
+ ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- if (cpuset_mems_allowed_intersects(current, tsk))
- return true;
+ ret = cpuset_mems_allowed_intersects(current, tsk);
}
- } while_each_thread(start, tsk);
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
- do {
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
- return t;
+ goto found;
task_unlock(t);
- } while_each_thread(p, t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
- return NULL;
+ return t;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
chosen = p;
chosen_points = points;
}
- } while_each_thread(g, p);
+ }
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t = p;
+ struct task_struct *t;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- do {
+ for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- } while_each_thread(p, t);
+ }
read_unlock(&tasklist_lock);
- rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
- rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
+ rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4..533e2147d14f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
return;
/*
- * Walking all memory to count page types is very expensive and should
- * be inhibited in non-blockable contexts.
- */
- if (!(gfp_mask & __GFP_WAIT))
- filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
- /*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- preferred_zone->compact_considered = 0;
- preferred_zone->compact_defer_shift = 0;
- if (order >= preferred_zone->compact_order_failed)
- preferred_zone->compact_order_failed = order + 1;
+ compaction_defer_reset(preferred_zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2535,8 +2525,15 @@ rebalance:
}
/* Atomic allocations - we can't balance anything */
- if (!wait)
+ if (!wait) {
+ /*
+ * All existing users of the deprecated __GFP_NOFAIL are
+ * blockable, so warn of any new users that actually allow this
+ * type of allocation to fail.
+ */
+ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
+ }
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
+ int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* When memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
}
/*
@@ -4209,7 +4219,6 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node_nopanic(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif
/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);
if (start_pfn < end_pfn)
- free_bootmem_node(NODE_DATA(this_nid),
- PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT);
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
- usemapsize);
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node_nopanic(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_type *type = &memblock.memory;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
+
+ /*
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for (i = 0; i < type->cnt; i++) {
+ if (!memblock_is_hotpluggable(&type->regions[i]))
+ continue;
+
+ nid = type->regions[i].nid;
+
+ usable_startpfn = PFN_DOWN(type->regions[i].base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ goto out2;
+ }
/*
- * If movablecore was specified, calculate what size of
+ * If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
@@ -5131,6 +5167,7 @@ restart:
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
+out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem_nopanic(size);
+ table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3bd0b8e6ab12..cfd162882c00 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
table_size = sizeof(struct page_cgroup) * nr_pages;
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
diff --git a/mm/percpu.c b/mm/percpu.c
index afbf352ae580..036cfe07050f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
if (!ptr)
return NULL;
ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- free_bootmem(__pa(ai), ai->__ai_size);
+ memblock_free_early(__pa(ai), ai->__ai_size);
}
/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
- group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
- unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
- unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+ group_offsets = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_offsets[0]), 0);
+ group_sizes = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_sizes[0]), 0);
+ unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
+ unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+ pcpu_slot = memblock_virt_alloc(
+ pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
- schunk = alloc_bootmem(pcpu_chunk_struct_size);
+ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
schunk->base_addr = base_addr;
schunk->map = smap;
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(pcpu_chunk_struct_size);
+ dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = alloc_bootmem_nopanic(areas_size);
+ areas = memblock_virt_alloc_nopanic(areas_size, 0);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -1712,7 +1715,7 @@ out_free_areas:
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- free_bootmem(__pa(areas), areas_size);
+ memblock_free_early(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = alloc_bootmem(pages_size);
+ pages = memblock_virt_alloc(pages_size, 0);
/* allocate pages */
j = 0;
@@ -1823,7 +1826,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pages), pages_size);
+ memblock_free_early(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_virt_alloc_from_nopanic(
+ size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- free_bootmem(__pa(ptr), size);
+ memblock_free_early(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ fc = memblock_virt_alloc_from_nopanic(unit_size,
+ PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/rmap.c b/mm/rmap.c
index 068522d8502a..962e2a1e13a0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+struct page_referenced_arg {
+ int mapcount;
+ int referenced;
+ unsigned long vm_flags;
+ struct mem_cgroup *memcg;
+};
/*
- * Subfunctions of page_referenced: page_referenced_one called
- * repeatedly from either page_referenced_anon or page_referenced_file.
+ * arg: page_referenced_arg will be passed
*/
int page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, unsigned int *mapcount,
- unsigned long *vm_flags)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
spin_unlock(ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
/* go ahead even if the pmd is pmd_trans_splitting() */
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
*/
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- (*mapcount)--;
-
- if (referenced)
- *vm_flags |= vma->vm_flags;
-out:
- return referenced;
-}
-
-static int page_referenced_anon(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
-{
- unsigned int mapcount;
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int referenced = 0;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return referenced;
-
- mapcount = page_mapcount(page);
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
+ if (referenced) {
+ pra->referenced++;
+ pra->vm_flags |= vma->vm_flags;
}
- page_unlock_anon_vma_read(anon_vma);
- return referenced;
+ pra->mapcount--;
+ if (!pra->mapcount)
+ return SWAP_SUCCESS; /* To break the loop */
+
+ return SWAP_AGAIN;
}
-/**
- * page_referenced_file - referenced check for object-based rmap
- * @page: the page we're checking references on.
- * @memcg: target memory control group
- * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
- *
- * For an object-based mapped page, find all the places it is mapped and
- * check/clear the referenced flag. This is done by following the page->mapping
- * pointer, then walking the chain of vmas it holds. It returns the number
- * of references it found.
- *
- * This function is only called from page_referenced for object-based pages.
- */
-static int page_referenced_file(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
- unsigned int mapcount;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int referenced = 0;
-
- /*
- * The caller's checks on page->mapping and !PageAnon have made
- * sure that this is a file page: the check for page->mapping
- * excludes the case just before it gets set on an anon page.
- */
- BUG_ON(PageAnon(page));
-
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
- */
- BUG_ON(!PageLocked(page));
-
- mutex_lock(&mapping->i_mmap_mutex);
+ struct page_referenced_arg *pra = arg;
+ struct mem_cgroup *memcg = pra->memcg;
- /*
- * i_mmap_mutex does not stabilize mapcount at all, but mapcount
- * is more likely to be accurate if we note it after spinning.
- */
- mapcount = page_mapcount(page);
-
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
- }
+ if (!mm_match_cgroup(vma->vm_mm, memcg))
+ return true;
- mutex_unlock(&mapping->i_mmap_mutex);
- return referenced;
+ return false;
}
/**
@@ -851,41 +768,57 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int referenced = 0;
+ int ret;
int we_locked = 0;
+ struct page_referenced_arg pra = {
+ .mapcount = page_mapcount(page),
+ .memcg = memcg,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_referenced_one,
+ .arg = (void *)&pra,
+ .anon_lock = page_lock_anon_vma_read,
+ };
*vm_flags = 0;
- if (page_mapped(page) && page_rmapping(page)) {
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
- if (!we_locked) {
- referenced++;
- goto out;
- }
- }
- if (unlikely(PageKsm(page)))
- referenced += page_referenced_ksm(page, memcg,
- vm_flags);
- else if (PageAnon(page))
- referenced += page_referenced_anon(page, memcg,
- vm_flags);
- else if (page->mapping)
- referenced += page_referenced_file(page, memcg,
- vm_flags);
- if (we_locked)
- unlock_page(page);
+ if (!page_mapped(page))
+ return 0;
+
+ if (!page_rmapping(page))
+ return 0;
+
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked)
+ return 1;
}
-out:
- return referenced;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg) {
+ rwc.invalid_vma = invalid_page_referenced_vma;
+ }
+
+ ret = rmap_walk(page, &rwc);
+ *vm_flags = pra.vm_flags;
+
+ if (we_locked)
+ unlock_page(page);
+
+ return pra.referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
+ int *cleaned = arg;
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
mmu_notifier_invalidate_page(mm, address);
+ (*cleaned)++;
+ }
out:
- return ret;
+ return SWAP_AGAIN;
}
-static int page_mkclean_file(struct address_space *mapping, struct page *page)
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int ret = 0;
-
- BUG_ON(PageAnon(page));
+ if (vma->vm_flags & VM_SHARED)
+ return 0;
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- if (vma->vm_flags & VM_SHARED) {
- unsigned long address = vma_address(page, vma);
- ret += page_mkclean_one(page, vma, address);
- }
- }
- mutex_unlock(&mapping->i_mmap_mutex);
- return ret;
+ return 1;
}
int page_mkclean(struct page *page)
{
- int ret = 0;
+ int cleaned = 0;
+ struct address_space *mapping;
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&cleaned,
+ .rmap_one = page_mkclean_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
BUG_ON(!PageLocked(page));
- if (page_mapped(page)) {
- struct address_space *mapping = page_mapping(page);
- if (mapping)
- ret = page_mkclean_file(mapping, page);
- }
+ if (!page_mapped(page))
+ return 0;
- return ret;
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
+ rmap_walk(page, &rwc);
+
+ return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
@@ -1177,17 +1110,17 @@ out:
}
/*
- * Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
+ * @arg: enum ttu_flags will be passed to this argument
*/
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, enum ttu_flags flags)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
+ enum ttu_flags flags = (enum ttu_flags)arg;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
-/**
- * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
- * rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * anonymous pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
-{
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return ret;
-
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address;
-
- /*
- * During exec, a temporary VMA is setup and later moved.
- * The VMA is moved under the anon_vma lock but not the
- * page tables leading to a race where migration cannot
- * find the migration ptes. Rather than increasing the
- * locking requirements of exec(), migration skips
- * temporary VMAs until after exec() completes.
- */
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_vma_temporary_stack(vma))
- continue;
-
- address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- break;
- }
-
- page_unlock_anon_vma_read(anon_vma);
- return ret;
-}
-
-/**
- * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * object-based pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_nonlinear(struct page *page,
+ struct address_space *mapping, struct vm_area_struct *vma)
{
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
unsigned long max_nl_size = 0;
unsigned int mapcount;
- if (PageHuge(page))
- pgoff = page->index << compound_order(page);
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- goto out;
- }
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
-
- /*
- * We don't bother to try to find the munlocked page in nonlinears.
- * It's costly. Instead, later, page reclaim logic may call
- * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
- */
- if (TTU_ACTION(flags) == TTU_MUNLOCK)
- goto out;
-
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
}
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- ret = SWAP_FAIL;
- goto out;
+ return SWAP_FAIL;
}
/*
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
mapcount = page_mapcount(page);
if (!mapcount)
- goto out;
+ return ret;
+
cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
cursor = (unsigned long) vma->vm_private_data;
- while ( cursor < max_nl_cursor &&
+ while (cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
if (try_to_unmap_cluster(cursor, &mapcount,
vma, page) == SWAP_MLOCK)
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
- goto out;
+ return ret;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
-out:
- mutex_unlock(&mapping->i_mmap_mutex);
+
return ret;
}
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
+{
+ return is_vma_temporary_stack(vma);
+}
+
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
@@ -1622,16 +1473,29 @@ out:
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .file_nonlinear = try_to_unmap_nonlinear,
+ .anon_lock = page_lock_anon_vma_read,
+ };
- BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
- if (unlikely(PageKsm(page)))
- ret = try_to_unmap_ksm(page, flags);
- else if (PageAnon(page))
- ret = try_to_unmap_anon(page, flags);
- else
- ret = try_to_unmap_file(page, flags);
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page))
+ rwc.invalid_vma = invalid_migration_vma;
+
+ ret = rmap_walk(page, &rwc);
+
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*/
int try_to_munlock(struct page *page)
{
+ int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)TTU_MUNLOCK,
+ .done = page_not_mapped,
+ /*
+ * We don't bother to try to find the munlocked page in
+ * nonlinears. It's costly. Instead, later, page reclaim logic
+ * may call try_to_unmap() and recover PG_mlocked lazily.
+ */
+ .file_nonlinear = NULL,
+ .anon_lock = page_lock_anon_vma_read,
+
+ };
+
VM_BUG_ON(!PageLocked(page) || PageLRU(page));
- if (unlikely(PageKsm(page)))
- return try_to_unmap_ksm(page, TTU_MUNLOCK);
- else if (PageAnon(page))
- return try_to_unmap_anon(page, TTU_MUNLOCK);
- else
- return try_to_unmap_file(page, TTU_MUNLOCK);
+ ret = rmap_walk(page, &rwc);
+ return ret;
}
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(anon_vma);
}
-#ifdef CONFIG_MIGRATION
-/*
- * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
- * Called by migrate.c to remove migration ptes, but might be used more later.
- */
-static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
+
+ if (rwc->anon_lock)
+ return rwc->anon_lock(page);
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
- return ret;
+ return NULL;
+
anon_vma_lock_read(anon_vma);
+ return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
+
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (!anon_vma)
+ return ret;
+
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
break;
+ if (rwc->done && rwc->done(page))
+ break;
}
anon_vma_unlock_read(anon_vma);
return ret;
}
-static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page->index << compound_order(page);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_mutex.
+ */
+ VM_BUG_ON(!PageLocked(page));
+
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
- break;
+ goto done;
+ if (rwc->done && rwc->done(page))
+ goto done;
}
- /*
- * No nonlinear handling: being always shared, nonlinear vmas
- * never contain migration ptes. Decide what to do about this
- * limitation to linear when we need rmap_walk() on nonlinear.
- */
+
+ if (!rwc->file_nonlinear)
+ goto done;
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto done;
+
+ ret = rwc->file_nonlinear(page, mapping, vma);
+
+done:
mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
- VM_BUG_ON(!PageLocked(page));
-
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rmap_one, arg);
+ return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rmap_one, arg);
+ return rmap_walk_anon(page, rwc);
else
- return rmap_walk_file(page, rmap_one, arg);
+ return rmap_walk_file(page, rwc);
}
-#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_HUGETLB_PAGE
/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be757..4cba9c2783a1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
+ return memblock_virt_alloc_try_nid(size, align, goal,
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
- free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e9590..63c3ea5c119c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
else
section = kzalloc(array_size, GFP_KERNEL);
} else {
- section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+ section = memblock_virt_alloc_node(array_size, nid);
}
return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ p = memblock_virt_alloc_try_nid_nopanic(size,
+ SMP_CACHE_BYTES, goal, limit,
+ nid);
if (!p && limit) {
limit = 0;
goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return alloc_bootmem_node_nopanic(pgdat, size);
+ return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
- map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
*/
size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = alloc_bootmem(size);
+ usemap_map = memblock_virt_alloc(size, 0);
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = alloc_bootmem(size2);
+ map_map = memblock_virt_alloc(size2, 0);
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- free_bootmem(__pa(map_map), size2);
+ memblock_free_early(__pa(map_map), size2);
#endif
- free_bootmem(__pa(usemap_map), size);
+ memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd03..d1100b619e61 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
-#include <linux/hugetlb.h>
#include "internal.h"
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page)
static void put_compound_page(struct page *page)
{
- if (unlikely(PageTail(page))) {
- /* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
-
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
- unsigned long flags;
+ struct page *page_head;
+ if (likely(!PageTail(page))) {
+ if (put_page_testzero(page)) {
/*
- * THP can not break up slab pages so avoid taking
- * compound_lock(). Slab performs non-atomic bit ops
- * on page->flags for better performance. In particular
- * slab_unlock() in slub used to be a hot path. It is
- * still hot on arches that do not support
- * this_cpu_cmpxchg_double().
+ * By the time all refcounts have been released
+ * split_huge_page cannot run anymore from under us.
*/
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- atomic_dec(&page->_mapcount);
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- if (put_page_testzero(page_head))
- __put_compound_page(page_head);
- return;
- } else
- /*
- * __split_huge_page_refcount
- * run before us, "page" was a
- * THP tail. The split
- * page_head has been freed
- * and reallocated as slab or
- * hugetlbfs page of smaller
- * order (only possible if
- * reallocated as slab on
- * x86).
- */
- goto skip_lock;
- }
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+ }
+ return;
+ }
+
+ /* __split_huge_page_refcount can run under us */
+ page_head = compound_trans_head(page);
+
+ /*
+ * THP can not break up slab pages so avoid taking
+ * compound_lock() and skip the tail page refcounting (in
+ * _mapcount) too. Slab performs non-atomic bit ops on
+ * page->flags for better performance. In particular
+ * slab_unlock() in slub used to be a hot path. It is still
+ * hot on arches that do not support
+ * this_cpu_cmpxchg_double().
+ *
+ * If "page" is part of a slab or hugetlbfs page it cannot be
+ * splitted and the head page cannot change from under us. And
+ * if "page" is part of a THP page under splitting, if the
+ * head page pointed by the THP tail isn't a THP head anymore,
+ * we'll find PageTail clear after smp_rmb() and we'll treat
+ * it as a single page.
+ */
+ if (!__compound_tail_refcounted(page_head)) {
+ /*
+ * If "page" is a THP tail, we must read the tail page
+ * flags after the head page flags. The
+ * split_huge_page side enforces write memory barriers
+ * between clearing PageTail and before the head page
+ * can be freed and reallocated.
+ */
+ smp_rmb();
+ if (likely(PageTail(page))) {
/*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
+ * __split_huge_page_refcount cannot race
+ * here.
*/
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
-skip_lock:
- if (put_page_testzero(page_head)) {
- /*
- * The head page may have been
- * freed and reallocated as a
- * compound page of smaller
- * order and then freed again.
- * All we know is that it
- * cannot have become: a THP
- * page, a compound page of
- * higher order, a tail page.
- * That is because we still
- * hold the refcount of the
- * split THP tail and
- * page_head was the THP head
- * before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
+ VM_BUG_ON(!PageHead(page_head));
+ VM_BUG_ON(page_mapcount(page) != 0);
+ if (put_page_testzero(page_head)) {
+ /*
+ * If this is the tail of a slab
+ * compound page, the tail pin must
+ * not be the last reference held on
+ * the page, because the PG_slab
+ * cannot be cleared before all tail
+ * pins (which skips the _mapcount
+ * tail refcounting) have been
+ * released. For hugetlbfs the tail
+ * pin may be the last reference on
+ * the page instead, because
+ * PageHeadHuge will not go away until
+ * the compound page enters the buddy
+ * allocator.
+ */
+ VM_BUG_ON(PageSlab(page_head));
+ __put_compound_page(page_head);
}
- VM_BUG_ON(page_head != page->first_page);
+ return;
+ } else
/*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on
- * the compound_lock.
+ * __split_huge_page_refcount run before us,
+ * "page" was a THP tail. The split page_head
+ * has been freed and reallocated as slab or
+ * hugetlbfs page of smaller order (only
+ * possible if reallocated as slab on x86).
*/
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- compound_unlock_irqrestore(page_head, flags);
+ goto out_put_single;
+ }
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+
+ /*
+ * page_head wasn't a dangling pointer but it may not
+ * be a head page anymore by the time we obtain the
+ * lock. That is ok as long as it can't be freed from
+ * under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
+ /*
+ * The head page may have been freed
+ * and reallocated as a compound page
+ * of smaller order and then freed
+ * again. All we know is that it
+ * cannot have become: a THP page, a
+ * compound page of higher order, a
+ * tail page. That is because we
+ * still hold the refcount of the
+ * split THP tail and page_head was
+ * the THP head before the split.
+ */
if (PageHead(page_head))
__put_compound_page(page_head);
else
__put_single_page(page_head);
}
- } else {
- /* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
- goto out_put_single;
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
}
- } else if (put_page_testzero(page)) {
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(page_mapcount(page) <= 0);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ compound_unlock_irqrestore(page_head, flags);
+
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
}
}
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page)
* split_huge_page().
*/
unsigned long flags;
- bool got = false;
+ bool got;
struct page *page_head = compound_trans_head(page);
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /* Ref to put_compound_page() comment. */
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- __get_page_tail_foll(page, false);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- put_page(page_head);
- return false;
- }
+ /* Ref to put_compound_page() comment. */
+ if (!__compound_tail_refcounted(page_head)) {
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON(!PageHead(page_head));
+ __get_page_tail_foll(page, true);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ return false;
}
+ }
+ got = false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
diff --git a/mm/util.c b/mm/util.c
index 808f375648e7..a24aa22f2473 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page)
return mapping;
}
+int overcommit_ratio_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_kbytes = 0;
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ sysctl_overcommit_ratio = 0;
+ return ret;
+}
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
+ unsigned long allowed;
+
+ if (sysctl_overcommit_kbytes)
+ allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+ else
+ allowed = ((totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100);
+ allowed += total_swap_pages;
+
+ return allowed;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..e4f0db2a3eae 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the physical pfn it maps to.
*/
-struct page *vmalloc_to_page(const void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
- struct page *page = NULL;
+ unsigned long pfn = 0;
pgd_t *pgd = pgd_offset_k(addr);
/*
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
ptep = pte_offset_map(pmd, addr);
pte = *ptep;
if (pte_present(pte))
- page = pte_page(pte);
+ pfn = pte_pfn(pte);
pte_unmap(ptep);
}
}
}
- return page;
+ return pfn;
}
-EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_to_pfn);
/*
- * Map a vmalloc()-space virtual address to the physical page frame number.
+ * Map a vmalloc()-space virtual address to the struct page.
*/
-unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
{
- return page_to_pfn(vmalloc_to_page(vmalloc_addr));
+ return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
}
-EXPORT_SYMBOL(vmalloc_to_pfn);
+EXPORT_SYMBOL(vmalloc_to_page);
/*** Global kva allocator ***/