summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-08 18:17:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-08 18:17:56 -0700
commitbf5f89463f5b3109a72ed13ca62b57e90213387d (patch)
treef9f288a341dd86efa996f7a08fb425eae34eb446 /mm
parent2d3e4866dea96b0506395b47bfefb234f2088dac (diff)
parent4d2b5bcab53f1c76a86279339561c9a36109a93b (diff)
downloadlinux-bf5f89463f5b3109a72ed13ca62b57e90213387d.tar.gz
linux-bf5f89463f5b3109a72ed13ca62b57e90213387d.tar.bz2
linux-bf5f89463f5b3109a72ed13ca62b57e90213387d.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: - the rest of MM - various misc things - procfs updates - lib/ updates - checkpatch updates - kdump/kexec updates - add kvmalloc helpers, use them - time helper updates for Y2038 issues. We're almost ready to remove current_fs_time() but that awaits a btrfs merge. - add tracepoints to DAX * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits) drivers/staging/ccree/ssi_hash.c: fix build with gcc-4.4.4 selftests/vm: add a test for virtual address range mapping dax: add tracepoint to dax_insert_mapping() dax: add tracepoint to dax_writeback_one() dax: add tracepoints to dax_writeback_mapping_range() dax: add tracepoints to dax_load_hole() dax: add tracepoints to dax_pfn_mkwrite() dax: add tracepoints to dax_iomap_pte_fault() mtd: nand: nandsim: convert to memalloc_noreclaim_*() treewide: convert PF_MEMALLOC manipulations to new helpers mm: introduce memalloc_noreclaim_{save,restore} mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC mm/huge_memory.c: deposit a pgtable for DAX PMD faults when required mm/huge_memory.c: use zap_deposited_table() more time: delete CURRENT_TIME_SEC and CURRENT_TIME gfs2: replace CURRENT_TIME with current_time apparmorfs: replace CURRENT_TIME with current_time() lustre: replace CURRENT_TIME macro fs: ubifs: replace CURRENT_TIME_SEC with current_time fs: ufs: use ktime_get_real_ts64() for birthtime ...
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c83
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/frame_vector.c5
-rw-r--r--mm/huge_memory.c28
-rw-r--r--mm/internal.h12
-rw-r--r--mm/kasan/kasan.c2
-rw-r--r--mm/nommu.c8
-rw-r--r--mm/page_alloc.c162
-rw-r--r--mm/page_isolation.c5
-rw-r--r--mm/swap_slots.c19
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c10
-rw-r--r--mm/util.c57
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c17
15 files changed, 310 insertions, 139 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 09c5282ebdd2..613c59e928cb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,11 +89,6 @@ static void map_pages(struct list_head *list)
list_splice(&tmp_list, list);
}
-static inline bool migrate_async_suitable(int migratetype)
-{
- return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
-}
-
#ifdef CONFIG_COMPACTION
int PageMovable(struct page *page)
@@ -988,6 +983,22 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
+static bool suitable_migration_source(struct compact_control *cc,
+ struct page *page)
+{
+ int block_mt;
+
+ if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
+ return true;
+
+ block_mt = get_pageblock_migratetype(page);
+
+ if (cc->migratetype == MIGRATE_MOVABLE)
+ return is_migrate_movable(block_mt);
+ else
+ return block_mt == cc->migratetype;
+}
+
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
struct page *page)
@@ -1007,7 +1018,7 @@ static bool suitable_migration_target(struct compact_control *cc,
return true;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (is_migrate_movable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(get_pageblock_migratetype(page)))
+ if (!suitable_migration_source(cc, page))
continue;
/* Perform the isolation */
@@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
- const int migratetype)
+static enum compact_result __compact_finished(struct zone *zone,
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark;
+ const int migratetype = cc->migratetype;
if (cc->contended || fatal_signal_pending(current))
return COMPACT_CONTENDED;
@@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- /* Compaction run is not finished if the watermark is not met */
- watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
-
- if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
- cc->alloc_flags))
- return COMPACT_CONTINUE;
+ if (cc->finishing_block) {
+ /*
+ * We have finished the pageblock, but better check again that
+ * we really succeeded.
+ */
+ if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ cc->finishing_block = false;
+ else
+ return COMPACT_CONTINUE;
+ }
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
@@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
- return COMPACT_SUCCESS;
+ true, &can_steal) != -1) {
+
+ /* movable pages are OK in any pageblock */
+ if (migratetype == MIGRATE_MOVABLE)
+ return COMPACT_SUCCESS;
+
+ /*
+ * We are stealing for a non-movable allocation. Make
+ * sure we finish compacting the current pageblock
+ * first so it is as free as possible and we won't
+ * have to steal another one soon. This only applies
+ * to sync compaction, as async compaction operates
+ * on pageblocks of the same migratetype.
+ */
+ if (cc->mode == MIGRATE_ASYNC ||
+ IS_ALIGNED(cc->migrate_pfn,
+ pageblock_nr_pages)) {
+ return COMPACT_SUCCESS;
+ }
+
+ cc->finishing_block = true;
+ return COMPACT_CONTINUE;
+ }
}
return COMPACT_NO_SUITABLE_PAGE;
}
static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc,
- const int migratetype)
+ struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc, migratetype);
+ ret = __compact_finished(zone, cc);
trace_mm_compaction_finished(zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
enum compact_result ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
- const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
@@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
migrate_prep_local();
- while ((ret = compact_finished(zone, cc, migratetype)) ==
- COMPACT_CONTINUE) {
+ while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
int err;
switch (isolate_migratepages(zone, cc)) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 681da61080bc..b7b973b47d8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2791,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file,
ssize_t written = 0;
unsigned int flags = 0;
- /*
- * Copies from kernel address space cannot fail (NFSD is a big user).
- */
- if (!iter_is_iovec(i))
- flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index db77dcb38afd..72ebec18629c 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames)
* Avoid higher order allocations, use vmalloc instead. It should
* be rare anyway.
*/
- if (size <= PAGE_SIZE)
- vec = kmalloc(size, GFP_KERNEL);
- else
- vec = vmalloc(size);
+ vec = kvmalloc(size, GFP_KERNEL);
if (!vec)
return NULL;
vec->nr_allocated = nr_frames;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b787c4cfda0e..a84909cf20d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
+ pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
+ pgtable_t pgtable)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
@@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
}
+
+ if (pgtable) {
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ atomic_long_inc(&mm->nr_ptes);
+ }
+
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
spin_unlock(ptl);
@@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
+ pgtable_t pgtable = NULL;
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm, addr);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+ insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1611,12 +1625,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_dax(vma)) {
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
- pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
- atomic_long_dec(&tlb->mm->nr_ptes);
+ zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
@@ -1625,10 +1640,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page)) {
- pgtable_t pgtable;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
- pte_free(tlb->mm, pgtable);
- atomic_long_dec(&tlb->mm->nr_ptes);
+ zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
diff --git a/mm/internal.h b/mm/internal.h
index 04d08ef91224..0e4f558412fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -183,6 +183,7 @@ extern int user_min_free_kbytes;
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
+ struct zone *zone;
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long total_migrate_scanned;
@@ -190,17 +191,18 @@ struct compact_control {
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
+ int order; /* order a direct compactor needs */
+ int migratetype; /* migratetype of direct compactor */
+ const unsigned int alloc_flags; /* alloc flags of a direct compactor */
+ const int classzone_idx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
- int order; /* order a direct compactor needs */
- const gfp_t gfp_mask; /* gfp mask of a direct compactor */
- const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
- struct zone *zone;
bool contended; /* Signal lock or sched contention */
+ bool finishing_block; /* Finishing current pageblock */
};
unsigned long
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 9348d27088c1..b10da59cf765 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -691,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size)
ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
shadow_start + shadow_size,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ GFP_KERNEL | __GFP_ZERO,
PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
__builtin_return_address(0));
diff --git a/mm/nommu.c b/mm/nommu.c
index 2d131b97a851..fc184f597d59 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -237,12 +237,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+{
+ return __vmalloc(size, flags, PAGE_KERNEL);
+}
+
void *vmalloc_user(unsigned long size)
{
void *ret;
- ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
if (ret) {
struct vm_area_struct *vma;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c25de46c58f..f9e450c6b6e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1832,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
- int migratetype)
+ int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
@@ -1851,6 +1851,9 @@ int move_freepages(struct zone *zone,
VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif
+ if (num_movable)
+ *num_movable = 0;
+
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1861,6 +1864,15 @@ int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!PageBuddy(page)) {
+ /*
+ * We assume that pages that could be isolated for
+ * migration are movable. But we don't actually try
+ * isolating, as that would be expensive.
+ */
+ if (num_movable &&
+ (PageLRU(page) || __PageMovable(page)))
+ (*num_movable)++;
+
page++;
continue;
}
@@ -1876,7 +1888,7 @@ int move_freepages(struct zone *zone,
}
int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+ int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
@@ -1893,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
if (!zone_spans_pfn(zone, end_pfn))
return 0;
- return move_freepages(zone, start_page, end_page, migratetype);
+ return move_freepages(zone, start_page, end_page, migratetype,
+ num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
@@ -1943,28 +1956,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * in the pageblock have a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type)
+ int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
- int pages;
+ struct free_area *area;
+ int free_pages, movable_pages, alike_pages;
+ int old_block_type;
+
+ old_block_type = get_pageblock_migratetype(page);
+
+ /*
+ * This can happen due to races and we want to prevent broken
+ * highatomic accounting.
+ */
+ if (is_migrate_highatomic(old_block_type))
+ goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return;
+ goto single_page;
+ }
+
+ /* We are not allowed to try stealing from the whole block */
+ if (!whole_block)
+ goto single_page;
+
+ free_pages = move_freepages_block(zone, page, start_type,
+ &movable_pages);
+ /*
+ * Determine how many pages are compatible with our allocation.
+ * For movable allocation, it's the number of movable pages which
+ * we just obtained. For other types it's a bit more tricky.
+ */
+ if (start_type == MIGRATE_MOVABLE) {
+ alike_pages = movable_pages;
+ } else {
+ /*
+ * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+ * to MOVABLE pageblock, consider all non-movable pages as
+ * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+ * vice versa, be conservative since we can't distinguish the
+ * exact migratetype of non-movable pages.
+ */
+ if (old_block_type == MIGRATE_MOVABLE)
+ alike_pages = pageblock_nr_pages
+ - (free_pages + movable_pages);
+ else
+ alike_pages = 0;
}
- pages = move_freepages_block(zone, page, start_type);
+ /* moving whole block can fail due to zone boundary conditions */
+ if (!free_pages)
+ goto single_page;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
+ /*
+ * If a sufficient number of pages in the block are either free or of
+ * comparable migratability as our allocation, claim the whole block.
+ */
+ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
+
+ return;
+
+single_page:
+ area = &zone->free_area[current_order];
+ list_move(&page->lru, &area->free_list[start_type]);
}
/*
@@ -2034,7 +2098,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
&& !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
- move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
@@ -2111,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
- ret = move_freepages_block(zone, page, ac->migratetype);
+ ret = move_freepages_block(zone, page, ac->migratetype,
+ NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
@@ -2123,8 +2188,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
return false;
}
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
@@ -2145,32 +2215,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal && !is_migrate_highatomic_page(page))
- steal_suitable_fallback(zone, page, start_migratetype);
-
- /* Remove the page from the freelists */
- area->nr_free--;
- list_del(&page->lru);
- rmv_page_order(page);
- expand(zone, page, order, current_order, area,
- start_migratetype);
- /*
- * The pcppage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * find_suitable_fallback(). This is OK as long as it does not
- * differ for MIGRATE_CMA pageblocks. Those can be used as
- * fallback only via special __rmqueue_cma_fallback() function
- */
- set_pcppage_migratetype(page, start_migratetype);
+ steal_suitable_fallback(zone, page, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
- return page;
+ return true;
}
- return NULL;
+ return false;
}
/*
@@ -2182,13 +2237,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
{
struct page *page;
+retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (!page && __rmqueue_fallback(zone, order, migratetype))
+ goto retry;
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -3227,14 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
+ unsigned int noreclaim_flag;
if (!order)
return NULL;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
- current->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
@@ -3381,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct reclaim_state reclaim_state;
int progress;
+ unsigned int noreclaim_flag;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3396,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- current->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -3609,6 +3667,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
@@ -3676,12 +3735,17 @@ retry_cpuset:
/*
* For costly allocations, try direct compaction first, as it's likely
- * that we have enough base pages and don't need to reclaim. Don't try
- * that for allocations that are allowed to ignore watermarks, as the
- * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ * that we have enough base pages and don't need to reclaim. For non-
+ * movable high-order allocations, do that as well, as compaction will
+ * try to prevent permanent fragmentation by migrating from blocks of the
+ * same migratetype.
+ * Don't try this for allocations that are allowed to ignore
+ * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
- !gfp_pfmemalloc_allowed(gfp_mask)) {
+ if (can_direct_reclaim &&
+ (costly_order ||
+ (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+ && !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
@@ -3693,7 +3757,7 @@ retry_cpuset:
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/
- if (gfp_mask & __GFP_NORETRY) {
+ if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
@@ -3774,7 +3838,7 @@ retry:
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
- if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_REPEAT))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7927bbb54a4e..5092e4ef00c8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -66,7 +66,8 @@ out:
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
zone->nr_isolate_pageblock++;
- nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+ NULL);
__mod_zone_freepage_state(zone, -nr_pages, migratetype);
}
@@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* pageblock scanning for freepage moving.
*/
if (!isolated_page) {
- nr_pages = move_freepages_block(zone, page, migratetype);
+ nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
set_pageblock_migratetype(page, migratetype);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index aa1c415f4abd..58f6c78f1dad 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -31,6 +31,7 @@
#include <linux/cpumask.h>
#include <linux/vmalloc.h>
#include <linux/mutex.h>
+#include <linux/mm.h>
#ifdef CONFIG_SWAP
@@ -119,16 +120,18 @@ static int alloc_swap_slot_cache(unsigned int cpu)
/*
* Do allocation outside swap_slots_cache_mutex
- * as vzalloc could trigger reclaim and get_swap_page,
+ * as kvzalloc could trigger reclaim and get_swap_page,
* which can lock swap_slots_cache_mutex.
*/
- slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+ GFP_KERNEL);
if (!slots)
return -ENOMEM;
- slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+ slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+ GFP_KERNEL);
if (!slots_ret) {
- vfree(slots);
+ kvfree(slots);
return -ENOMEM;
}
@@ -152,9 +155,9 @@ static int alloc_swap_slot_cache(unsigned int cpu)
out:
mutex_unlock(&swap_slots_cache_mutex);
if (slots)
- vfree(slots);
+ kvfree(slots);
if (slots_ret)
- vfree(slots_ret);
+ kvfree(slots_ret);
return 0;
}
@@ -171,7 +174,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
cache->cur = 0;
cache->nr = 0;
if (free_slots && cache->slots) {
- vfree(cache->slots);
+ kvfree(cache->slots);
cache->slots = NULL;
}
mutex_unlock(&cache->alloc_lock);
@@ -186,7 +189,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
}
spin_unlock_irq(&cache->free_lock);
if (slots)
- vfree(slots);
+ kvfree(slots);
}
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7bfb9bd1ca21..539b8885e3d1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -523,7 +523,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
unsigned int i, nr;
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = vzalloc(sizeof(struct address_space) * nr);
+ spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
if (!spaces)
return -ENOMEM;
for (i = 0; i < nr; i++) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b86b2aca3fb9..4f6cba1b6632 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2270,8 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
- vfree(cluster_info);
- vfree(frontswap_map);
+ kvfree(cluster_info);
+ kvfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
exit_swap_address_space(p->type);
@@ -2794,7 +2794,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
+ cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info),
+ GFP_KERNEL);
if (!cluster_info) {
error = -ENOMEM;
goto bad_swap;
@@ -2827,7 +2828,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
+ frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long),
+ GFP_KERNEL);
if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
/*
diff --git a/mm/util.c b/mm/util.c
index 656dc5e37a87..718154debc87 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -329,6 +329,63 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible with (a superset of) GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT
+ * is supported only for large (>32kB) allocations, and it should be used only if
+ * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Any use of gfp flags outside of GFP_KERNEL should be discussed with mm people.
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+ gfp_t kmalloc_flags = flags;
+ void *ret;
+
+ /*
+ * vmalloc uses GFP_KERNEL for some internal allocations (e.g. page tables)
+ * so the given set of flags has to be compatible.
+ */
+ WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+
+ /*
+ * Make sure that larger requests are not too disruptive - no OOM
+ * killer and no allocation failure warnings as we have a fallback
+ */
+ if (size > PAGE_SIZE) {
+ kmalloc_flags |= __GFP_NOWARN;
+
+ /*
+ * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly
+ * requests because there is no other way to tell the allocator
+ * that we want to fail rather than retry endlessly.
+ */
+ if (!(kmalloc_flags & __GFP_REPEAT) ||
+ (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ kmalloc_flags |= __GFP_NORETRY;
+ }
+
+ ret = kmalloc_node(size, kmalloc_flags, node);
+
+ /*
+ * It doesn't really make sense to fall back to vmalloc for sub-page
+ * requests
+ */
+ if (ret || size <= PAGE_SIZE)
+ return ret;
+
+ return __vmalloc_node_flags(size, node, flags);
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b52aeed3f58e..1dda6d8a200a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1649,16 +1649,13 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+ const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
@@ -1786,8 +1783,15 @@ fail:
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
+ *
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT
+ * and __GFP_NOFAIL are not supported.
+ *
+ * Any use of gfp flags other than GFP_KERNEL should be discussed
+ * with the mm people.
+ *
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
+void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller)
{
@@ -1802,13 +1806,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
-}
-
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -1821,7 +1818,7 @@ static inline void *__vmalloc_node_flags(unsigned long size,
void *vmalloc(unsigned long size)
{
return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_HIGHMEM);
+ GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
@@ -1838,7 +1835,7 @@ EXPORT_SYMBOL(vmalloc);
void *vzalloc(unsigned long size)
{
return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -1855,7 +1852,7 @@ void *vmalloc_user(unsigned long size)
void *ret;
ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ GFP_KERNEL | __GFP_ZERO,
PAGE_KERNEL, NUMA_NO_NODE,
__builtin_return_address(0));
if (ret) {
@@ -1879,7 +1876,7 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+ return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
node, __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -1899,7 +1896,7 @@ EXPORT_SYMBOL(vmalloc_node);
void *vzalloc_node(unsigned long size, int node)
{
return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc_node);
@@ -1921,7 +1918,7 @@ EXPORT_SYMBOL(vzalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
NUMA_NO_NODE, __builtin_return_address(0));
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4e7ed65842af..2f45c0520f43 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3036,6 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
@@ -3062,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.gfp_mask,
sc.reclaim_idx);
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- current->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3589,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
+ unsigned int noreclaim_flag;
- p->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3599,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
}
@@ -3764,6 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int classzone_idx = gfp_zone(gfp_mask);
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
@@ -3781,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_UNMAP.
*/
- p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ noreclaim_flag = memalloc_noreclaim_save();
+ p->flags |= PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3797,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
- current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ current->flags &= ~PF_SWAPWRITE;
+ memalloc_noreclaim_restore(noreclaim_flag);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}