summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c156
-rw-r--r--mm/debug.c3
-rw-r--r--mm/gup.c228
-rw-r--r--mm/huge_memory.c106
-rw-r--r--mm/hugetlb.c158
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h22
-rw-r--r--mm/memcontrol.c702
-rw-r--r--mm/memory.c15
-rw-r--r--mm/mempolicy.c277
-rw-r--r--mm/migrate.c5
-rw-r--r--mm/mincore.c166
-rw-r--r--mm/mmap.c7
-rw-r--r--mm/mmzone.c4
-rw-r--r--mm/nommu.c37
-rw-r--r--mm/oom_kill.c169
-rw-r--r--mm/page-writeback.c17
-rw-r--r--mm/page_alloc.c432
-rw-r--r--mm/page_counter.c7
-rw-r--r--mm/page_owner.c26
-rw-r--r--mm/pagewalk.c238
-rw-r--r--mm/process_vm_access.c7
-rw-r--r--mm/rmap.c12
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/util.c10
-rw-r--r--mm/vmscan.c32
-rw-r--r--mm/vmstat.c6
28 files changed, 1738 insertions, 1110 deletions
diff --git a/mm/cma.c b/mm/cma.c
index a85ae28709a3..75016fd1de90 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
cma->order_per_bit = order_per_bit;
*res_cma = cma;
cma_area_count++;
+ totalcma_pages += (size / PAGE_SIZE);
return 0;
}
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (ret)
goto err;
- totalcma_pages += (size / PAGE_SIZE);
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..b68736c8a1ce 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -34,6 +34,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+#ifdef CONFIG_TRACEPOINTS
+static const char *const compaction_status_string[] = {
+ "deferred",
+ "skipped",
+ "continue",
+ "partial",
+ "complete",
+ "no_suitable_page",
+ "not_suitable_zone",
+};
+#endif
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
@@ -113,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
#ifdef CONFIG_COMPACTION
+
+/* Do not skip compaction more than 64 times */
+#define COMPACT_MAX_DEFER_SHIFT 6
+
+/*
+ * Compaction is deferred when compaction fails to result in a page
+ * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
+ */
+void defer_compaction(struct zone *zone, int order)
+{
+ zone->compact_considered = 0;
+ zone->compact_defer_shift++;
+
+ if (order < zone->compact_order_failed)
+ zone->compact_order_failed = order;
+
+ if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
+ zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
+
+ trace_mm_compaction_defer_compaction(zone, order);
+}
+
+/* Returns true if compaction should be skipped this time */
+bool compaction_deferred(struct zone *zone, int order)
+{
+ unsigned long defer_limit = 1UL << zone->compact_defer_shift;
+
+ if (order < zone->compact_order_failed)
+ return false;
+
+ /* Avoid possible overflow */
+ if (++zone->compact_considered > defer_limit)
+ zone->compact_considered = defer_limit;
+
+ if (zone->compact_considered >= defer_limit)
+ return false;
+
+ trace_mm_compaction_deferred(zone, order);
+
+ return true;
+}
+
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+void compaction_defer_reset(struct zone *zone, int order,
+ bool alloc_success)
+{
+ if (alloc_success) {
+ zone->compact_considered = 0;
+ zone->compact_defer_shift = 0;
+ }
+ if (order >= zone->compact_order_failed)
+ zone->compact_order_failed = order + 1;
+
+ trace_mm_compaction_defer_reset(zone, order);
+}
+
+/* Returns true if restarting compaction after many failures */
+bool compaction_restarting(struct zone *zone, int order)
+{
+ if (order < zone->compact_order_failed)
+ return false;
+
+ return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
+ zone->compact_considered >= 1UL << zone->compact_defer_shift;
+}
+
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
struct page *page)
@@ -421,11 +503,12 @@ isolate_fail:
}
+ trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
+ nr_scanned, total_isolated);
+
/* Record how far we have got within the block */
*start_pfn = blockpfn;
- trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
-
/*
* If strict isolation is requested by CMA then check that all the
* pages requested were isolated. If there were any failures, 0 is
@@ -581,6 +664,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ unsigned long start_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -741,7 +825,8 @@ isolate_success:
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
- trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
+ nr_scanned, nr_isolated);
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
if (nr_isolated)
@@ -1037,7 +1122,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
-static int compact_finished(struct zone *zone, struct compact_control *cc,
+static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
@@ -1092,7 +1177,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
}
- return COMPACT_CONTINUE;
+ return COMPACT_NO_SUITABLE_PAGE;
+}
+
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+ const int migratetype)
+{
+ int ret;
+
+ ret = __compact_finished(zone, cc, migratetype);
+ trace_mm_compaction_finished(zone, cc->order, ret);
+ if (ret == COMPACT_NO_SUITABLE_PAGE)
+ ret = COMPACT_CONTINUE;
+
+ return ret;
}
/*
@@ -1102,7 +1200,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-unsigned long compaction_suitable(struct zone *zone, int order,
+static unsigned long __compaction_suitable(struct zone *zone, int order,
int alloc_flags, int classzone_idx)
{
int fragindex;
@@ -1146,11 +1244,24 @@ unsigned long compaction_suitable(struct zone *zone, int order,
*/
fragindex = fragmentation_index(zone, order);
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- return COMPACT_SKIPPED;
+ return COMPACT_NOT_SUITABLE_ZONE;
return COMPACT_CONTINUE;
}
+unsigned long compaction_suitable(struct zone *zone, int order,
+ int alloc_flags, int classzone_idx)
+{
+ unsigned long ret;
+
+ ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+ trace_mm_compaction_suitable(zone, order, ret);
+ if (ret == COMPACT_NOT_SUITABLE_ZONE)
+ ret = COMPACT_SKIPPED;
+
+ return ret;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
@@ -1197,7 +1308,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync);
migrate_prep_local();
@@ -1299,7 +1411,8 @@ out:
zone->compact_cached_free_pfn = free_pfn;
}
- trace_mm_compaction_end(ret);
+ trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync, ret);
return ret;
}
@@ -1335,22 +1448,20 @@ int sysctl_extfrag_threshold = 500;
/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
- * @zonelist: The zonelist used for the current allocation
- * @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
- * @nodemask: The allowed nodes to allocate from
+ * @order: The order of the current allocation
+ * @alloc_flags: The allocation flags of the current allocation
+ * @ac: The context of current allocation
* @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that determines if compaction was aborted due to
* need_resched() or lock contention
*
* This is the main entry point for direct page compaction.
*/
-unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1364,9 +1475,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
if (!order || !may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
+ trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
+
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
int status;
int zone_contended;
@@ -1374,7 +1487,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
continue;
status = compact_zone_order(zone, order, gfp_mask, mode,
- &zone_contended, alloc_flags, classzone_idx);
+ &zone_contended, alloc_flags,
+ ac->classzone_idx);
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1384,7 +1498,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- classzone_idx, alloc_flags)) {
+ ac->classzone_idx, alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
diff --git a/mm/debug.c b/mm/debug.c
index d69cb5a7ba9a..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
"get_unmapped_area %p\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
- "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
+ "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+ mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/gup.c b/mm/gup.c
index 12bc2bc33da7..c2da1163986a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- return NULL;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- return page;
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pmd_none(*pmd))
return no_page_table(vma, flags);
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else
- page = NULL;
- }
- return page;
+ page = follow_huge_pmd(mm, address, pmd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
return no_page_table(vma, flags);
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ int write, int force,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ int *locked, bool notify_drop,
+ unsigned int flags)
+{
+ long ret, pages_done;
+ bool lock_dropped;
+
+ if (locked) {
+ /* if VM_FAULT_RETRY can be returned, vmas become invalid */
+ BUG_ON(vmas);
+ /* check caller initialized locked */
+ BUG_ON(*locked != 1);
+ }
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ pages_done = 0;
+ lock_dropped = false;
+ for (;;) {
+ ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ vmas, locked);
+ if (!locked)
+ /* VM_FAULT_RETRY couldn't trigger, bypass */
+ return ret;
+
+ /* VM_FAULT_RETRY cannot return errors */
+ if (!*locked) {
+ BUG_ON(ret < 0);
+ BUG_ON(ret >= nr_pages);
+ }
+
+ if (!pages)
+ /* If it's a prefault don't insist harder */
+ return ret;
+
+ if (ret > 0) {
+ nr_pages -= ret;
+ pages_done += ret;
+ if (!nr_pages)
+ break;
+ }
+ if (*locked) {
+ /* VM_FAULT_RETRY didn't trigger */
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+ pages += ret;
+ start += ret << PAGE_SHIFT;
+
+ /*
+ * Repeat on the address that fired VM_FAULT_RETRY
+ * without FAULT_FLAG_ALLOW_RETRY but with
+ * FAULT_FLAG_TRIED.
+ */
+ *locked = 1;
+ lock_dropped = true;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ pages, NULL, NULL);
+ if (ret != 1) {
+ BUG_ON(ret > 1);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ nr_pages--;
+ pages_done++;
+ if (!nr_pages)
+ break;
+ pages++;
+ start += PAGE_SIZE;
+ }
+ if (notify_drop && lock_dropped && *locked) {
+ /*
+ * We must let the caller know we temporarily dropped the lock
+ * and so the critical section protected by it was lost.
+ */
+ up_read(&mm->mmap_sem);
+ *locked = 0;
+ }
+ return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * to:
+ *
+ * int locked = 1;
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * if (locked)
+ * up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, locked, true, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
+ * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ *
+ * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
+ * caller if required (just like with __get_user_pages). "FOLL_GET",
+ * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
+ * according to the parameters "pages", "write", "force"
+ * respectively.
+ */
+__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
+{
+ long ret;
+ int locked = 1;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, &locked, false, gup_flags);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * with:
+ *
+ * get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead, if the two parameters
+ * "tsk" and "mm" are respectively equal to current and current->mm,
+ * or if "force" shall be set to 1 (get_user_pages_fast misses the
+ * "force" parameter).
+ */
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/*
* get_user_pages() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, int write,
int force, struct page **pages, struct vm_area_struct **vmas)
{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, vmas, NULL, false, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..cb7be110cad3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
}
static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
-
-static inline bool is_huge_zero_page(struct page *page)
-{
- return ACCESS_ONCE(huge_zero_page) == page;
-}
+struct page *huge_zero_page __read_mostly;
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
-static inline struct page *alloc_hugepage_vma(int defrag,
- struct vm_area_struct *vma,
- unsigned long haddr, int nd,
- gfp_t extra_gfp)
-{
- return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
- HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
{
+ gfp_t gfp;
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return 0;
}
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow())
- new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- else
+ !transparent_hugepage_debug_cow()) {
+ gfp_t gfp;
+
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ } else
new_page = NULL;
if (unlikely(!new_page)) {
@@ -1423,26 +1412,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- spinlock_t *ptl;
- int ret = 0;
-
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- /*
- * All logical pages in the range are present
- * if backed by a huge page.
- */
- spin_unlock(ptl);
- memset(vec, 1, (end - addr) >> PAGE_SHIFT);
- ret = 1;
- }
-
- return ret;
-}
-
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -2148,7 +2117,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int referenced = 0, none = 0;
+ int none = 0;
+ bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
@@ -2158,7 +2128,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
else
goto out;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out;
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page))
@@ -2168,9 +2138,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
- goto out;
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2146,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ if (pte_write(pteval)) {
+ writable = true;
+ } else {
+ if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /*
+ * Page is not in the swap cache. It can be collapsed
+ * into a THP.
+ */
+ }
+
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
@@ -2195,9 +2185,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (likely(referenced))
+ if (likely(referenced && writable))
return 1;
out:
release_pte_pages(pte, _pte);
@@ -2550,11 +2540,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, referenced = 0, none = 0;
+ int ret = 0, none = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
+ bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2573,8 +2564,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
else
goto out_unmap;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out_unmap;
+ if (pte_write(pteval))
+ writable = true;
+
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
@@ -2591,14 +2585,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page))
goto out_unmap;
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (referenced)
+ if (referenced && writable)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index be0e5d0db5ec..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2657,9 +2657,10 @@ again:
goto unlock;
/*
- * HWPoisoned hugepage is already unmapped and dropped reference
+ * Migrating hugepage or HWPoisoned hugepage is already
+ * unmapped and its refcount is dropped, so just clear pte here.
*/
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep);
goto unlock;
}
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
+ int need_wait_lock = 0;
address &= huge_page_mask(h);
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
/*
+ * entry could be a migration/hwpoison entry at this point, so this
+ * check prevents the kernel from going below assuming that we have
+ * a active hugepage in pagecache. This goto expects the 2nd page fault,
+ * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
+ * handle it.
+ */
+ if (!pte_present(entry))
+ goto out_mutex;
+
+ /*
* If we are going to COW the mapping later, we examine the pending
* reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ ptl = huge_pte_lock(h, mm, ptep);
+
+ /* Check for a racing update before calling hugetlb_cow */
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_ptl;
+
/*
* hugetlb_cow() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
- * Note that locking order is always pagecache_page -> page,
- * so no worry about deadlock.
*/
page = pte_page(entry);
- get_page(page);
if (page != pagecache_page)
- lock_page(page);
-
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
- /* Check for a racing update before calling hugetlb_cow */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
- goto out_ptl;
+ if (!trylock_page(page)) {
+ need_wait_lock = 1;
+ goto out_ptl;
+ }
+ get_page(page);
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page, ptl);
- goto out_ptl;
+ goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, address, ptep);
-
+out_put_page:
+ if (page != pagecache_page)
+ unlock_page(page);
+ put_page(page);
out_ptl:
spin_unlock(ptl);
@@ -3229,12 +3245,17 @@ out_ptl:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
- if (page != pagecache_page)
- unlock_page(page);
- put_page(page);
-
out_mutex:
mutex_unlock(&htlb_fault_mutex_table[hash]);
+ /*
+ * Generally it's safe to hold refcount during waiting page lock. But
+ * here we just wait to defer the next page fault to avoid busy loop and
+ * the page is not used after unlocked before returning from the current
+ * page fault. So we are safe from accessing freed page, even if we wait
+ * here without taking refcount.
+ */
+ if (need_wait_lock)
+ wait_on_page_locked(page);
return ret;
}
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
spin_unlock(ptl);
continue;
}
- if (!huge_pte_none(huge_ptep_get(ptep))) {
+ pte = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ spin_unlock(ptl);
+ continue;
+ }
+ if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (is_write_migration_entry(entry)) {
+ pte_t newpte;
+
+ make_migration_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
+ spin_unlock(ptl);
+ continue;
+ }
+ if (!huge_pte_none(pte)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(huge_pte_modify(pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
+ mm_inc_nr_pmds(mm);
get_page(virt_to_page(spte));
break;
}
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
spin_lock(ptl);
- if (pud_none(*pud))
+ if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
+ } else {
put_page(virt_to_page(spte));
+ mm_inc_nr_pmds(mm);
+ }
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
pud_clear(pud);
put_page(virt_to_page(ptep));
+ mm_dec_nr_pmds(mm);
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return (pte_t *) pmd;
}
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
+/*
+ * These functions are overwritable if your architecture needs its own
+ * behavior.
+ */
+struct page * __weak
+follow_huge_addr(struct mm_struct *mm, unsigned long address,
+ int write)
+{
+ return ERR_PTR(-EINVAL);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
+struct page * __weak
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int flags)
{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+retry:
+ ptl = pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
+ /*
+ * make sure that the address range covered by this pmd is not
+ * unmapped from other threads.
+ */
+ if (!pmd_huge(*pmd))
+ goto out;
+ if (pmd_present(*pmd)) {
+ page = pte_page(*(pte_t *)pmd) +
+ ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ if (flags & FOLL_GET)
+ get_page(page);
+ } else {
+ if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ spin_unlock(ptl);
+ __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
return page;
}
-#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-
-/* Can be overriden by architectures */
struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
+ pud_t *pud, int flags)
{
- BUG();
- return NULL;
-}
+ if (flags & FOLL_GET)
+ return NULL;
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+ return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
#ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 037e1c00a5b7..6e0057439a46 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return -EINVAL;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, &nr_pages);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
if (ret)
return ret;
diff --git a/mm/internal.h b/mm/internal.h
index efad241f7014..c4d6c9b43491 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
*/
/*
+ * Structure for holding the mostly immutable allocation parameters passed
+ * between functions involved in allocations, including the alloc_pages*
+ * family of functions.
+ *
+ * nodemask, migratetype and high_zoneidx are initialized only once in
+ * __alloc_pages_nodemask() and then never change.
+ *
+ * zonelist, preferred_zone and classzone_idx are set first in
+ * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * in __alloc_pages_slowpath(). All other functions pass the whole strucure
+ * by a const pointer.
+ */
+struct alloc_context {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ struct zone *preferred_zone;
+ int classzone_idx;
+ int migratetype;
+ enum zone_type high_zoneidx;
+};
+
+/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3f8a4f52a0c..095c1f96fbec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
#else
#define do_swap_account 0
#endif
-
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
"swap",
};
-enum mem_cgroup_events_index {
- MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
- MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
- MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
- MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
- MEM_CGROUP_EVENTS_NSTATS,
-};
-
static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@@ -284,6 +267,10 @@ struct mem_cgroup {
struct page_counter memsw;
struct page_counter kmem;
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
unsigned long soft_limit;
/* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
- atomic_t moving_account;
+ atomic_t moving_account;
/* taken only while moving_account > 0 */
- spinlock_t move_lock;
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
/*
* percpu counter.
*/
@@ -371,21 +360,18 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
/* Stuffs for move charges at task migration. */
/*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved.
*/
-enum move_type {
- MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
- MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
- NR_MOVE_TYPE,
-};
+#define MOVE_ANON 0x1U
+#define MOVE_FILE 0x2U
+#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
- unsigned long immigrate_flags;
+ unsigned long flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
@@ -396,16 +382,6 @@ static struct move_charge_struct {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
-static bool move_anon(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
-}
-
-static bool move_file(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
-}
-
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -1365,6 +1341,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1557,7 +1547,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- set_thread_flag(TIF_MEMDIE);
+ mark_tsk_oom_victim(current);
return;
}
@@ -1931,7 +1921,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (!memcg)
return false;
- if (!handle)
+ if (!handle || oom_killer_disabled)
goto cleanup;
owait.memcg = memcg;
@@ -1977,34 +1967,33 @@ cleanup:
/**
* mem_cgroup_begin_page_stat - begin a page state statistics transaction
* @page: page that is going to change accounted state
- * @locked: &memcg->move_lock slowpath was taken
- * @flags: IRQ-state flags for &memcg->move_lock
*
* This function must mark the beginning of an accounted page state
* change to prevent double accounting when the page is concurrently
* being moved to another memcg:
*
- * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * memcg = mem_cgroup_begin_page_stat(page);
* if (TestClearPageState(page))
* mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg, locked, flags);
- *
- * The RCU lock is held throughout the transaction. The fast path can
- * get away without acquiring the memcg->move_lock (@locked is false)
- * because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when the page
- * state that is going to change is the only thing preventing the page
- * from being uncharged. E.g. end-writeback clearing PageWriteback(),
- * which allows migration to go ahead and uncharge the page before the
- * account transaction might be complete.
+ * mem_cgroup_end_page_stat(memcg);
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
- bool *locked,
- unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
{
struct mem_cgroup *memcg;
+ unsigned long flags;
+ /*
+ * The RCU lock is held throughout the transaction. The fast
+ * path can get away without acquiring the memcg->move_lock
+ * because page moving starts with an RCU grace period.
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page from being uncharged.
+ * E.g. end-writeback clearing PageWriteback(), which allows
+ * migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
@@ -2014,16 +2003,22 @@ again:
if (unlikely(!memcg))
return NULL;
- *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
- spin_lock_irqsave(&memcg->move_lock, *flags);
+ spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
- *locked = true;
+
+ /*
+ * When charge migration first begins, we can have locked and
+ * unlocked page stat updates happening concurrently. Track
+ * the task who has the lock for mem_cgroup_end_page_stat().
+ */
+ memcg->move_lock_task = current;
+ memcg->move_lock_flags = flags;
return memcg;
}
@@ -2031,14 +2026,17 @@ again:
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
* @memcg: the memcg that was accounted against
- * @locked: value received from mem_cgroup_begin_page_stat()
- * @flags: value received from mem_cgroup_begin_page_stat()
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
- unsigned long *flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
- if (memcg && *locked)
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ if (memcg && memcg->move_lock_task == current) {
+ unsigned long flags = memcg->move_lock_flags;
+
+ memcg->move_lock_task = NULL;
+ memcg->move_lock_flags = 0;
+
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
+ }
rcu_read_unlock();
}
@@ -2131,17 +2129,6 @@ static void drain_local_stock(struct work_struct *dummy)
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
-static void __init memcg_stock_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct memcg_stock_pcp *stock =
- &per_cpu(memcg_stock, cpu);
- INIT_WORK(&stock->work, drain_local_stock);
- }
-}
-
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
@@ -2291,6 +2278,8 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
+ mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
@@ -2332,6 +2321,8 @@ retry:
if (fatal_signal_pending(current))
goto bypass;
+ mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -2343,6 +2334,16 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ /*
+ * If the hierarchy is above the normal consumption range,
+ * make the charging task trim their excess contribution.
+ */
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
done:
return ret;
}
@@ -3390,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
int ret;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, &nr_pages);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
if (ret)
return ret;
@@ -3466,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val >= (1 << NR_MOVE_TYPE))
+ if (val & ~MOVE_MASK)
return -EINVAL;
/*
@@ -3544,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
struct mem_cgroup *mi;
unsigned int i;
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
+ MEM_CGROUP_STAT_NSTATS);
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
+ MEM_CGROUP_EVENTS_NSTATS);
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3758,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
unsigned long usage;
int i, size, ret;
- ret = page_counter_memparse(args, &threshold);
+ ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
@@ -4248,7 +4253,7 @@ out_kfree:
return ret;
}
-static struct cftype mem_cgroup_files[] = {
+static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4359,34 +4364,6 @@ static struct cftype mem_cgroup_files[] = {
{ }, /* terminate */
};
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
- {
- .name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- { }, /* terminate */
-};
-#endif
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -4482,29 +4459,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
- struct mem_cgroup_tree_per_node *rtpn;
- struct mem_cgroup_tree_per_zone *rtpz;
- int tmp, node, zone;
-
- for_each_node(node) {
- tmp = node;
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- BUG_ON(!rtpn);
-
- soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- rtpz = &rtpn->rb_tree_per_zone[zone];
- rtpz->rb_root = RB_ROOT;
- spin_lock_init(&rtpz->lock);
- }
- }
-}
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -4524,6 +4478,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -4569,6 +4524,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4579,6 +4535,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -4654,6 +4611,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ memcg->low = 0;
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
}
@@ -4730,12 +4689,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
- /* we don't move shared anon */
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return NULL;
- } else if (!move_file())
- /* we ignore mapcount for file pages */
- return NULL;
+ } else {
+ if (!(mc.flags & MOVE_FILE))
+ return NULL;
+ }
if (!get_page_unless_zero(page))
return NULL;
@@ -4749,7 +4708,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!move_anon() || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
/*
* Because lookup_swap_cache() updates some statistics counter,
@@ -4778,7 +4737,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
if (!vma->vm_file) /* anonymous vma */
return NULL;
- if (!move_file())
+ if (!(mc.flags & MOVE_FILE))
return NULL;
mapping = vma->vm_file->f_mapping;
@@ -4857,7 +4816,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return ret;
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
@@ -4880,7 +4839,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -4906,20 +4865,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_count_precharge_walk);
- }
+ walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -4999,15 +4951,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- unsigned long move_charge_at_immigrate;
+ unsigned long move_flags;
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_charge_at_immigrate = memcg->move_charge_at_immigrate;
- if (move_charge_at_immigrate) {
+ move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5027,7 +4979,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
- mc.immigrate_flags = move_charge_at_immigrate;
+ mc.flags = move_flags;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -5052,7 +5004,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct mm_walk *walk)
{
int ret = 0;
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
@@ -5148,7 +5100,10 @@ put: /* get_mctgt_type() gets the page */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ };
lru_add_drain_all();
/*
@@ -5171,24 +5126,11 @@ retry:
cond_resched();
goto retry;
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- int ret;
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- ret = walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_move_charge_walk);
- if (ret)
- /*
- * means we have consumed all precharges and failed in
- * doing additional charge. Just abandon here.
- */
- break;
- }
+ /*
+ * When we have consumed all precharges and failed in doing
+ * additional charge, the page walk just aborts.
+ */
+ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
up_read(&mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
@@ -5239,118 +5181,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys memory_cgrp_subsys = {
- .css_alloc = mem_cgroup_css_alloc,
- .css_online = mem_cgroup_css_online,
- .css_offline = mem_cgroup_css_offline,
- .css_free = mem_cgroup_css_free,
- .css_reset = mem_cgroup_css_reset,
- .can_attach = mem_cgroup_can_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
- .legacy_cftypes = mem_cgroup_files,
- .early_init = 0,
-};
+static u64 memory_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+}
-#ifdef CONFIG_MEMCG_SWAP
-static int __init enable_swap_account(char *s)
+static int memory_low_show(struct seq_file *m, void *v)
{
- if (!strcmp(s, "1"))
- really_do_swap_account = 1;
- else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
- return 1;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long low = ACCESS_ONCE(memcg->low);
+
+ if (low == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
+
+ return 0;
}
-__setup("swapaccount=", enable_swap_account);
-static void __init memsw_file_init(void)
+static ssize_t memory_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long low;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &low);
+ if (err)
+ return err;
+
+ memcg->low = low;
+
+ return nbytes;
}
-static void __init enable_swap_cgroup(void)
+static int memory_high_show(struct seq_file *m, void *v)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- memsw_file_init();
- }
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long high = ACCESS_ONCE(memcg->high);
+
+ if (high == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
+
+ return 0;
}
-#else
-static void __init enable_swap_cgroup(void)
+static ssize_t memory_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &high);
+ if (err)
+ return err;
+
+ memcg->high = high;
+
+ return nbytes;
}
-#endif
-#ifdef CONFIG_MEMCG_SWAP
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @page to @entry.
- */
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg;
- unsigned short oldid;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = ACCESS_ONCE(memcg->memory.limit);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
- if (!do_swap_account)
- return;
+ return 0;
+}
- memcg = page->mem_cgroup;
+static ssize_t memory_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
- /* Readahead page, never charged */
- if (!memcg)
- return;
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &max);
+ if (err)
+ return err;
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
- VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ err = mem_cgroup_resize_limit(memcg, max);
+ if (err)
+ return err;
- page->mem_cgroup = NULL;
+ return nbytes;
+}
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+static int memory_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
+ seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
+ seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
+ seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
+ seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
- mem_cgroup_charge_statistics(memcg, page, -1);
- memcg_check_events(memcg, page);
+ return 0;
+}
+
+static struct cftype memory_files[] = {
+ {
+ .name = "current",
+ .read_u64 = memory_current_read,
+ },
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_low_show,
+ .write = memory_low_write,
+ },
+ {
+ .name = "high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_high_show,
+ .write = memory_high_write,
+ },
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_max_show,
+ .write = memory_max_write,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_events_show,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys memory_cgrp_subsys = {
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .attach = mem_cgroup_move_task,
+ .bind = mem_cgroup_bind,
+ .dfl_cftypes = memory_files,
+ .legacy_cftypes = mem_cgroup_legacy_files,
+ .early_init = 0,
+};
+
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+void mem_cgroup_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx,
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
}
/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
- * @entry: swap entry to uncharge
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
*
- * Drop the memsw charge associated with @entry.
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
- unsigned short id;
+ if (mem_cgroup_disabled())
+ return false;
- if (!do_swap_account)
- return;
+ /*
+ * The toplevel group doesn't have a configurable range, so
+ * it's never low when looked at directly, and it is not
+ * considered an ancestor when assessing the hierarchy.
+ */
- id = swap_cgroup_record(entry, 0);
- rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
- mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
+ if (memcg == root_mem_cgroup)
+ return false;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
+
+ while (memcg != root) {
+ memcg = parent_mem_cgroup(memcg);
+
+ if (memcg == root_mem_cgroup)
+ break;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
}
- rcu_read_unlock();
+ return true;
}
-#endif
/**
* mem_cgroup_try_charge - try charging a page
@@ -5684,10 +5719,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
*/
static int __init mem_cgroup_init(void)
{
+ int cpu, node;
+
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
- enable_swap_cgroup();
- mem_cgroup_soft_limit_tree_init();
- memcg_stock_init();
+
+ for_each_possible_cpu(cpu)
+ INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+ drain_local_stock);
+
+ for_each_node(node) {
+ struct mem_cgroup_tree_per_node *rtpn;
+ int zone;
+
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct mem_cgroup_tree_per_zone *rtpz;
+
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+ }
+
return 0;
}
subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (!do_swap_account)
+ return;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ page->mem_cgroup = NULL;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, 1);
+
+ /* XXX: caller holds IRQ-safe mapping->tree_lock */
+ VM_BUG_ON(!irqs_disabled());
+
+ mem_cgroup_charge_statistics(memcg, page, -1);
+ memcg_check_events(memcg, page);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(entry, 0);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memsw, 1);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css);
+ }
+ rcu_read_unlock();
+}
+
+/* for remember boot option*/
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+ if (!strcmp(s, "1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+ if (!mem_cgroup_disabled() && really_do_swap_account) {
+ do_swap_account = 1;
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+ memsw_cgroup_files));
+ }
+ return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
+ mm_dec_nr_pmds(tlb->mm);
}
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
- if (pud_present(*pud)) /* Another has populated it */
- pmd_free(mm, new);
- else
+ if (!pud_present(*pud)) {
+ mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
-#else
- if (pgd_present(*pud)) /* Another has populated it */
+ } else /* Another has populated it */
pmd_free(mm, new);
- else
+#else
+ if (!pgd_present(*pud)) {
+ mm_inc_nr_pmds(mm);
pgd_populate(mm, pud, new);
+ } else /* Another has populated it */
+ pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
+struct queue_pages {
+ struct list_head *pagelist;
+ unsigned long flags;
+ nodemask_t *nmask;
+ struct vm_area_struct *prev;
+};
+
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ int nid;
pte_t *pte;
spinlock_t *ptl;
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (PageReserved(page))
continue;
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, private, flags);
- else
- break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- return addr != end;
+ migrate_page_add(page, qp->pagelist, flags);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
}
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
- pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
int nid;
struct page *page;
spinlock_t *ptl;
pte_t entry;
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- entry = huge_ptep_get((pte_t *)pmd);
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, private);
+ isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (!pmd_present(*pmd))
- continue;
- if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
- queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
- flags, private);
- continue;
- }
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pmd++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
- continue;
- if (pud_none_or_clear_bad(pud))
- continue;
- if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pud++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pgd_t *pgd;
- unsigned long next;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
}
#endif /* CONFIG_NUMA_BALANCING */
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long endvma = vma->vm_end;
+ unsigned long flags = qp->flags;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+ if (!vma->vm_next && vma->vm_end < end)
+ return -EFAULT;
+ if (qp->prev && qp->prev->vm_end < vma->vm_start)
+ return -EFAULT;
+ }
+
+ qp->prev = vma;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ if (flags & MPOL_MF_LAZY) {
+ /* Similar to task_numa_work, skip inaccessible VMAs */
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ change_prot_numa(vma, start, endvma);
+ return 1;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))
+ /* queue pages from current vma */
+ return 0;
+ return 1;
+}
+
/*
* Walk through page tables and collect pages to be migrated.
*
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- const nodemask_t *nodes, unsigned long flags, void *private)
-{
- int err = 0;
- struct vm_area_struct *vma, *prev;
-
- vma = find_vma(mm, start);
- if (!vma)
- return -EFAULT;
- prev = NULL;
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
- if (prev && prev->vm_end < vma->vm_start)
- return -EFAULT;
- }
-
- if (flags & MPOL_MF_LAZY) {
- /* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
- change_prot_numa(vma, start, endvma);
- goto next;
- }
-
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma))) {
-
- err = queue_pages_pgd_range(vma, start, endvma, nodes,
- flags, private);
- if (err)
- break;
- }
-next:
- prev = vma;
- }
- return err;
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
+{
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .prev = NULL,
+ };
+ struct mm_walk queue_pages_walk = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .mm = mm,
+ .private = &qp,
+ };
+
+ return walk_page_range(start, end, &queue_pages_walk);
}
/*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
* mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node)
+ unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
unsigned int cpuset_mems_cookie;
+ struct zonelist *zl;
+ nodemask_t *nmask;
retry_cpuset:
pol = get_vma_policy(vma, addr);
cpuset_mems_cookie = read_mems_allowed_begin();
- if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+ pol->mode != MPOL_INTERLEAVE)) {
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node, we only try to allocate from the
+ * current node and don't fall back to other nodes, as the
+ * cost of remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ nmask = policy_nodemask(gfp, pol);
+ if (!nmask || node_isset(node, *nmask)) {
+ mpol_cond_put(pol);
+ page = alloc_pages_exact_node(node, gfp, order);
+ goto out;
+ }
+ }
+
+ if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
-
- return page;
+ goto out;
}
- page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, node),
- policy_nodemask(gfp, pol));
+
+ nmask = policy_nodemask(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
mpol_cond_put(pol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6e284bcca8bb..f98067e5d353 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl)
{
pte_t pte;
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- isolate_huge_page(page, &pagelist);
+ if (PageHead(page))
+ isolate_huge_page(page, &pagelist);
goto put_and_set;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 46527c023e0c..be25efde64a4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,38 +19,25 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
- struct hstate *h;
+ unsigned char present;
+ unsigned char *vec = walk->private;
- h = hstate_vma(vma);
- while (1) {
- unsigned char present;
- pte_t *ptep;
- /*
- * Huge pages are always in RAM for now, but
- * theoretically it needs to be checked.
- */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = ptep && !huge_pte_none(huge_ptep_get(ptep));
- while (1) {
- *vec = present;
- vec++;
- addr += PAGE_SIZE;
- if (addr == end)
- return;
- /* check hugepage border */
- if (!(addr & ~huge_page_mask(h)))
- break;
- }
- }
+ /*
+ * Hugepages under user process are always in RAM and never
+ * swapped out, but theoretically it needs to be checked.
+ */
+ present = pte && !huge_pte_none(huge_ptep_get(pte));
+ for (; addr != end; vec++, addr += PAGE_SIZE)
+ *vec = present;
+ walk->private = vec;
#else
BUG();
#endif
+ return 0;
}
/*
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
-static void mincore_unmapped_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct vm_area_struct *vma, unsigned char *vec)
{
unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
@@ -111,23 +97,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
for (i = 0; i < nr; i++)
vec[i] = 0;
}
+ return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ walk->private += __mincore_unmapped_range(addr, end,
+ walk->vma, walk->private);
+ return 0;
}
-static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- unsigned long next;
spinlock_t *ptl;
+ struct vm_area_struct *vma = walk->vma;
pte_t *ptep;
+ unsigned char *vec = walk->private;
+ int nr = (end - addr) >> PAGE_SHIFT;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ memset(vec, 1, nr);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmd)) {
+ __mincore_unmapped_range(addr, end, vma, vec);
+ goto out;
+ }
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
+ ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
pte_t pte = *ptep;
- next = addr + PAGE_SIZE;
if (pte_none(pte))
- mincore_unmapped_range(vma, addr, next, vec);
+ __mincore_unmapped_range(addr, addr + PAGE_SIZE,
+ vma, vec);
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
@@ -150,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
}
vec++;
- } while (ptep++, addr = next, addr != end);
+ }
pte_unmap_unlock(ptep - 1, ptl);
-}
-
-static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pmd_t *pmd;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd)) {
- if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
- vec += (next - addr) >> PAGE_SHIFT;
- continue;
- }
- /* fall through */
- }
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pte_range(vma, pmd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pmd++, addr = next, addr != end);
-}
-
-static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pud_t *pud;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pmd_range(vma, pud, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pud++, addr = next, addr != end);
-}
-
-static void mincore_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pgd_t *pgd;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pud_range(vma, pgd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pgd++, addr = next, addr != end);
+out:
+ walk->private += nr;
+ cond_resched();
+ return 0;
}
/*
@@ -224,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
{
struct vm_area_struct *vma;
unsigned long end;
+ int err;
+ struct mm_walk mincore_walk = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+ .private = vec,
+ };
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
-
+ mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
-
- if (is_vm_hugetlb_page(vma))
- mincore_hugetlb_page_range(vma, addr, end, vec);
- else
- mincore_page_range(vma, addr, end, vec);
-
+ err = walk_page_range(addr, end, &mincore_walk);
+ if (err < 0)
+ return err;
return (end - addr) >> PAGE_SHIFT;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 14d84666e8ba..da9990acc08b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@ -2851,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm)
vma = remove_vma(vma);
}
vm_unacct_memory(nr_accounted);
-
- WARN_ON(atomic_long_read(&mm->nr_ptes) >
- (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8556db..7d87ebb0d632 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
- nodemask_t *nodes,
- struct zone **zone)
+ nodemask_t *nodes)
{
/*
* Find the next suitable zone to use for the allocation.
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
- *zone = zonelist_zone(z);
return z;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 541bed64e348..1a19fb3b0463 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
+{
+ long ret;
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, 0);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -1895,7 +1928,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
vm_acct_memory(pages);
@@ -1959,7 +1992,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d503e9ce1c7b..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
- get_mm_counter(p->mm, MM_SWAPENTS);
+ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+ atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
task_unlock(p);
/*
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (unlikely(frozen(task)))
- __thaw_task(task);
if (!force_kill)
return OOM_SCAN_ABORT;
}
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
atomic_long_read(&task->mm->nr_ptes),
+ mm_nr_pmds(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
/*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
*/
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
-int oom_kills_count(void)
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
+
+/**
+ * mark_tsk_oom_victim - marks the given taks as OOM victim.
+ * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
+ */
+void mark_tsk_oom_victim(struct task_struct *tsk)
{
- return atomic_read(&oom_kills);
+ WARN_ON(oom_killer_disabled);
+ /* OOM killer might race with memcg OOM */
+ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
+ /*
+ * Make sure that the task is woken up from uninterruptible sleep
+ * if it is frozen because OOM killer wouldn't be able to free
+ * any memory and livelock. freezing_slow_path will tell the freezer
+ * that TIF_MEMDIE tasks should be ignored.
+ */
+ __thaw_task(tsk);
+ atomic_inc(&oom_victims);
+}
+
+/**
+ * unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
+ */
+void unmark_oom_victim(void)
+{
+ if (!test_and_clear_thread_flag(TIF_MEMDIE))
+ return;
+
+ down_read(&oom_sem);
+ /*
+ * There is no need to signal the lasst oom_victim if there
+ * is nobody who cares.
+ */
+ if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ wake_up_all(&oom_victims_wait);
+ up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+ /*
+ * Make sure to not race with an ongoing OOM killer
+ * and that the current is not the victim.
+ */
+ down_write(&oom_sem);
+ if (test_thread_flag(TIF_MEMDIE)) {
+ up_write(&oom_sem);
+ return false;
+ }
+
+ oom_killer_disabled = true;
+ up_write(&oom_sem);
+
+ wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+ return true;
}
-void note_oom_kill(void)
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
{
- atomic_inc(&oom_kills);
+ down_write(&oom_sem);
+ oom_killer_disabled = false;
+ up_write(&oom_sem);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
- if (task_will_free_mem(p)) {
- set_tsk_thread_flag(p, TIF_MEMDIE);
+ task_lock(p);
+ if (p->mm && task_will_free_mem(p)) {
+ mark_tsk_oom_victim(p);
+ task_unlock(p);
put_task_struct(p);
return;
}
+ task_unlock(p);
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
+ mark_tsk_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
rcu_read_unlock();
- set_tsk_thread_flag(victim, TIF_MEMDIE);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
put_task_struct(victim);
}
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
}
/**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
+ *
+ * But don't select if current has already released its mm and cleared
+ * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/
- if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- set_thread_flag(TIF_MEMDIE);
+ if (current->mm &&
+ (fatal_signal_pending(current) || task_will_free_mem(current))) {
+ mark_tsk_oom_victim(current);
return;
}
@@ -688,6 +772,32 @@ out:
schedule_timeout_killable(1);
}
+/**
+ * out_of_memory - tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
+ * when it returns false. Otherwise returns true.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
+{
+ bool ret = false;
+
+ down_read(&oom_sem);
+ if (!oom_killer_disabled) {
+ __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+ ret = true;
+ }
+ up_read(&oom_sem);
+
+ return ret;
+}
+
/*
* The pagefault handler calls here because it is out of memory, so kill a
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void)
{
struct zonelist *zonelist;
+ down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- return;
+ goto unlock;
zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- out_of_memory(NULL, 0, 0, NULL, false);
+ if (!oom_killer_disabled)
+ __out_of_memory(NULL, 0, 0, NULL, false);
+ else
+ /*
+ * There shouldn't be any user tasks runable while the
+ * OOM killer is disabled so the current task has to
+ * be a racing OOM victim for which oom_killer_disable()
+ * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
+
oom_zonelist_unlock(zonelist, GFP_KERNEL);
}
+unlock:
+ up_read(&oom_sem);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f4335238e33..6a73e47e81c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2168,9 +2168,12 @@ EXPORT_SYMBOL(account_page_redirty);
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
+ int ret;
+
wbc->pages_skipped++;
+ ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
- return __set_page_dirty_nobuffers(page);
+ return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
@@ -2308,12 +2311,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2338,19 +2339,17 @@ int test_clear_page_writeback(struct page *page)
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2380,7 +2379,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f121050e8530..8d52ab18fe0d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
PB_migrate, PB_migrate_end);
}
-bool oom_killer_disabled __read_mostly;
-
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
@@ -381,36 +379,6 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
-/* update __split_huge_page_refcount if you change this function */
-static int destroy_compound_page(struct page *page, unsigned long order)
-{
- int i;
- int nr_pages = 1 << order;
- int bad = 0;
-
- if (unlikely(compound_order(page) != order)) {
- bad_page(page, "wrong compound order", 0);
- bad++;
- }
-
- __ClearPageHead(page);
-
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
-
- if (unlikely(!PageTail(p))) {
- bad_page(page, "PageTail not set", 0);
- bad++;
- } else if (unlikely(p->first_page != page)) {
- bad_page(page, "first_page not consistent", 0);
- bad++;
- }
- __ClearPageTail(p);
- }
-
- return bad;
-}
-
static inline void prep_zero_page(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
@@ -613,10 +581,7 @@ static inline void __free_one_page(struct page *page,
int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
-
- if (unlikely(PageCompound(page)))
- if (unlikely(destroy_compound_page(page, order)))
- return;
+ VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (is_migrate_isolate(migratetype)) {
@@ -797,21 +762,40 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return 0;
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set", 0);
+ return 1;
+ }
+ if (unlikely(page->first_page != head_page)) {
+ bad_page(page, "first_page not consistent", 0);
+ return 1;
+ }
+ return 0;
+}
+
static bool free_pages_prepare(struct page *page, unsigned int order)
{
- int i;
- int bad = 0;
+ bool compound = PageCompound(page);
+ int i, bad = 0;
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
if (PageAnon(page))
page->mapping = NULL;
- for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page);
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
bad += free_pages_check(page + i);
+ }
if (bad)
return false;
@@ -970,7 +954,8 @@ static inline int check_new_page(struct page *page)
return 0;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ int alloc_flags)
{
int i;
@@ -994,6 +979,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
set_page_owner(page, order, gfp_flags);
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+ * allocate the page. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the page
+ * being used for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+
return 0;
}
@@ -1130,39 +1123,34 @@ static void change_pageblock_range(struct page *pageblock_page,
}
/*
- * If breaking a large block of pages, move all free pages to the preferred
- * allocation list. If falling back for a reclaimable kernel allocation, be
- * more aggressive about taking ownership of free pages.
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
*
- * On the other hand, never change migration type of MIGRATE_CMA pageblocks
- * nor move CMA pages to different free lists. We don't want unmovable pages
- * to be allocated from MIGRATE_CMA areas.
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
*
- * Returns the new migratetype of the pageblock (or the same old migratetype
- * if it was unchanged).
+ * If we claim more than half of the pageblock, change pageblock's migratetype
+ * as well.
*/
-static int try_to_steal_freepages(struct zone *zone, struct page *page,
+static void try_to_steal_freepages(struct zone *zone, struct page *page,
int start_type, int fallback_type)
{
int current_order = page_order(page);
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself. We also ensure the freepage_migratetype
- * is set to CMA so it is returned to the correct freelist in case
- * the page ends up being not actually allocated from the pcp lists.
- */
- if (is_migrate_cma(fallback_type))
- return fallback_type;
-
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return start_type;
+ return;
}
if (current_order >= pageblock_order / 2 ||
start_type == MIGRATE_RECLAIMABLE ||
+ start_type == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled) {
int pages;
@@ -1170,15 +1158,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/* Claim the whole block if over half of it is free */
if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled) {
-
+ page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
- return start_type;
- }
-
}
-
- return fallback_type;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1188,14 +1170,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
- int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
+ int i;
for (i = 0;; i++) {
- migratetype = fallbacks[start_migratetype][i];
+ int migratetype = fallbacks[start_migratetype][i];
+ int buddy_type = start_migratetype;
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1192,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct page, lru);
area->nr_free--;
- new_type = try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
+ if (!is_migrate_cma(migratetype)) {
+ try_to_steal_freepages(zone, page,
+ start_migratetype,
+ migratetype);
+ } else {
+ /*
+ * When borrowing from MIGRATE_CMA, we need to
+ * release the excess buddy pages to CMA
+ * itself, and we do not try to steal extra
+ * free pages.
+ */
+ buddy_type = migratetype;
+ }
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
expand(zone, page, order, current_order, area,
- new_type);
- /* The freepage_migratetype may differ from pageblock's
+ buddy_type);
+
+ /*
+ * The freepage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages. This is OK as long as it does
- * not differ for MIGRATE_CMA type.
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
*/
- set_freepage_migratetype(page, new_type);
+ set_freepage_migratetype(page, buddy_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype, new_type);
+ start_migratetype, migratetype);
return page;
}
@@ -1642,9 +1639,7 @@ int split_free_page(struct page *page)
}
/*
- * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
- * we cheat by calling it from here, in the order > 0 path. Saves a branch
- * or two.
+ * Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1650,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
struct page *page;
bool cold = ((gfp_flags & __GFP_COLD) != 0);
-again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -1711,8 +1705,6 @@ again:
local_irq_restore(flags);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- if (prep_new_page(page, order, gfp_flags))
- goto again;
return page;
failed:
@@ -2033,10 +2025,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
* a page.
*/
static struct page *
-get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int classzone_idx, int migratetype)
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
+ const struct alloc_context *ac)
{
+ struct zonelist *zonelist = ac->zonelist;
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
@@ -2055,8 +2047,8 @@ zonelist_scan:
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ ac->nodemask) {
unsigned long mark;
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2065,7 @@ zonelist_scan:
* time the page has in memory before being reclaimed.
*/
if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(preferred_zone, zone))
+ if (!zone_local(ac->preferred_zone, zone))
break;
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
nr_fair_skipped++;
@@ -2111,7 +2103,7 @@ zonelist_scan:
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {
+ ac->classzone_idx, alloc_flags)) {
int ret;
/* Checked here to keep the fast path fast */
@@ -2132,7 +2124,7 @@ zonelist_scan:
}
if (zone_reclaim_mode == 0 ||
- !zone_allows_reclaim(preferred_zone, zone))
+ !zone_allows_reclaim(ac->preferred_zone, zone))
goto this_zone_full;
/*
@@ -2154,7 +2146,7 @@ zonelist_scan:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
+ ac->classzone_idx, alloc_flags))
goto try_this_zone;
/*
@@ -2175,27 +2167,18 @@ zonelist_scan:
}
try_this_zone:
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
+ page = buffered_rmqueue(ac->preferred_zone, zone, order,
+ gfp_mask, ac->migratetype);
+ if (page) {
+ if (prep_new_page(page, order, gfp_mask, alloc_flags))
+ goto try_this_zone;
+ return page;
+ }
this_zone_full:
if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
zlc_mark_zone_full(zonelist, z);
}
- if (page) {
- /*
- * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
- * necessary to allocate the page. The expectation is
- * that the caller is taking steps that will free more
- * memory. The caller should avoid the page being used
- * for !PFMEMALLOC purposes.
- */
- page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
- return page;
- }
-
/*
* The first pass makes sure allocations are spread fairly within the
* local node. However, the local node might have free pages left
@@ -2208,7 +2191,7 @@ this_zone_full:
alloc_flags &= ~ALLOC_FAIR;
if (nr_fair_skipped) {
zonelist_rescan = true;
- reset_alloc_batches(preferred_zone);
+ reset_alloc_batches(ac->preferred_zone);
}
if (nr_online_nodes > 1)
zonelist_rescan = true;
@@ -2330,44 +2313,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ const struct alloc_context *ac, unsigned long *did_some_progress)
{
struct page *page;
*did_some_progress = 0;
- if (oom_killer_disabled)
- return NULL;
-
/*
* Acquire the per-zone oom lock for each zone. If that
* fails, somebody else is making progress for us.
*/
- if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
+ if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
}
/*
- * PM-freezer should be notified that there might be an OOM killer on
- * its way to kill and wake somebody up. This is too early and we might
- * end up not killing anything but false positives are acceptable.
- * See freeze_processes.
- */
- note_oom_kill();
-
- /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
*/
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
- order, zonelist, high_zoneidx,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -2379,7 +2347,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
- if (high_zoneidx < ZONE_NORMAL)
+ if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
/* The OOM killer does not compensate for light reclaim */
if (!(gfp_mask & __GFP_FS))
@@ -2395,10 +2363,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, nodemask, false);
- *did_some_progress = 1;
+ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+ *did_some_progress = 1;
out:
- oom_zonelist_unlock(zonelist, gfp_mask);
+ oom_zonelist_unlock(ac->zonelist, gfp_mask);
return page;
}
@@ -2406,10 +2374,9 @@ out:
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, enum migrate_mode mode,
- int *contended_compaction, bool *deferred_compaction)
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended_compaction,
+ bool *deferred_compaction)
{
unsigned long compact_result;
struct page *page;
@@ -2418,10 +2385,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
current->flags |= PF_MEMALLOC;
- compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, mode,
- contended_compaction,
- alloc_flags, classzone_idx);
+ compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ mode, contended_compaction);
current->flags &= ~PF_MEMALLOC;
switch (compact_result) {
@@ -2440,10 +2405,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, nodemask,
- order, zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -2467,10 +2430,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, enum migrate_mode mode,
- int *contended_compaction, bool *deferred_compaction)
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended_compaction,
+ bool *deferred_compaction)
{
return NULL;
}
@@ -2478,8 +2440,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
/* Perform direct synchronous page reclaim */
static int
-__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
- nodemask_t *nodemask)
+__perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac)
{
struct reclaim_state reclaim_state;
int progress;
@@ -2493,7 +2455,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
- progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+ progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
+ ac->nodemask);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2507,28 +2470,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ int alloc_flags, const struct alloc_context *ac,
+ unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
- *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
- nodemask);
+ *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
if (IS_ENABLED(CONFIG_NUMA))
- zlc_clear_zones_full(zonelist);
+ zlc_clear_zones_full(ac->zonelist);
retry:
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx,
- migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2549,36 +2507,30 @@ retry:
*/
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ const struct alloc_context *ac)
{
struct page *page;
do {
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
if (!page && gfp_mask & __GFP_NOFAIL)
- wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
+ HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
}
-static void wake_all_kswapds(unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone,
- nodemask_t *nodemask)
+static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->high_zoneidx, ac->nodemask)
+ wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
}
static inline int
@@ -2637,9 +2589,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ struct alloc_context *ac)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
@@ -2675,8 +2625,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
retry:
if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapds(order, zonelist, high_zoneidx,
- preferred_zone, nodemask);
+ wake_all_kswapds(order, ac);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2689,17 +2638,16 @@ retry:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
- NULL, &preferred_zone);
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, NULL, &ac->preferred_zone);
+ ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
/* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page)
goto got_pg;
@@ -2710,11 +2658,10 @@ retry:
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
- zonelist = node_zonelist(numa_node_id(), gfp_mask);
+ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
+ page = __alloc_pages_high_priority(gfp_mask, order, ac);
- page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
@@ -2743,11 +2690,9 @@ retry:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ migration_mode,
+ &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
@@ -2793,12 +2738,8 @@ retry:
migration_mode = MIGRATE_SYNC_LIGHT;
/* Try direct reclaim and then allocating */
- page = __alloc_pages_direct_reclaim(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- classzone_idx, migratetype,
- &did_some_progress);
+ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -2812,17 +2753,15 @@ retry:
* start OOM killing tasks.
*/
if (!did_some_progress) {
- page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
- high_zoneidx, nodemask,
- preferred_zone, classzone_idx,
- migratetype,&did_some_progress);
+ page = __alloc_pages_may_oom(gfp_mask, order, ac,
+ &did_some_progress);
if (page)
goto got_pg;
if (!did_some_progress)
goto nopage;
}
/* Wait for some write requests to complete then retry */
- wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
goto retry;
} else {
/*
@@ -2830,11 +2769,9 @@ retry:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac, migration_mode,
+ &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
@@ -2842,11 +2779,7 @@ retry:
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
- return page;
got_pg:
- if (kmemcheck_enabled)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
return page;
}
@@ -2857,14 +2790,16 @@ struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
struct zoneref *preferred_zoneref;
struct page *page = NULL;
- int migratetype = gfpflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- int classzone_idx;
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = {
+ .high_zoneidx = gfp_zone(gfp_mask),
+ .nodemask = nodemask,
+ .migratetype = gfpflags_to_migratetype(gfp_mask),
+ };
gfp_mask &= gfp_allowed_mask;
@@ -2883,37 +2818,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+ if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
+ /* We set it here, as __alloc_pages_slowpath might have changed it */
+ ac.zonelist = zonelist;
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
- nodemask ? : &cpuset_current_mems_allowed,
- &preferred_zone);
- if (!preferred_zone)
+ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
+ ac.nodemask ? : &cpuset_current_mems_allowed,
+ &ac.preferred_zone);
+ if (!ac.preferred_zone)
goto out;
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
/* First allocation attempt */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, alloc_flags,
- preferred_zone, classzone_idx, migratetype);
+ alloc_mask = gfp_mask|__GFP_HARDWALL;
+ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
- gfp_mask = memalloc_noio_flags(gfp_mask);
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
+ alloc_mask = memalloc_noio_flags(gfp_mask);
+
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
}
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+ if (kmemcheck_enabled && page)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
out:
/*
@@ -5047,8 +4985,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
- (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5420,9 +5358,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
else
- pr_cont("[mem %0#10lx-%0#10lx]\n",
- arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
- (arch_zone_highest_possible_pfn[i]
+ pr_cont("[mem %#018Lx-%#018Lx]\n",
+ (u64)arch_zone_lowest_possible_pfn[i]
+ << PAGE_SHIFT,
+ ((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
@@ -5430,15 +5369,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
- pr_info(" Node %d: %#010lx\n", i,
- zone_movable_pfn[i] << PAGE_SHIFT);
+ pr_info(" Node %d: %#018Lx\n", i,
+ (u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/* Print out the early node map */
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
- start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
+ pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ ((u64)end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a009574fbba9..11b4beda14ba 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
+ * @max: string meaning maximum possible value
* @nr_pages: returns the result in number of pages
*
* Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
* limited to %PAGE_COUNTER_MAX.
*/
-int page_counter_memparse(const char *buf, unsigned long *nr_pages)
+int page_counter_memparse(const char *buf, const char *max,
+ unsigned long *nr_pages)
{
- char unlimited[] = "-1";
char *end;
u64 bytes;
- if (!strncmp(buf, unlimited, sizeof(unlimited))) {
+ if (!strcmp(buf, max)) {
*nr_pages = PAGE_COUNTER_MAX;
return 0;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9ab4a9b5bc09..0993f5f36b01 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order)
void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
{
- struct page_ext *page_ext;
- struct stack_trace *trace;
-
- page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = ARRAY_SIZE(page_ext->trace_entries),
+ .entries = &page_ext->trace_entries[0],
+ .skip = 3,
+ };
- trace = &page_ext->trace;
- trace->nr_entries = 0;
- trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
- trace->entries = &page_ext->trace_entries[0];
- trace->skip = 3;
- save_stack_trace(&page_ext->trace);
+ save_stack_trace(&trace);
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
+ page_ext->nr_entries = trace.nr_entries;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
int ret;
int pageblock_mt, page_mt;
char *kbuf;
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
- ret += snprint_stack_trace(kbuf + ret, count - ret,
- &page_ext->trace, 0);
+ ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
if (ret >= count)
goto err;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b264bda46e1b..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd)) {
+ if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
@@ -59,7 +59,7 @@ again:
continue;
split_huge_page_pmd_mm(walk->mm, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
break;
continue;
}
- if (walk->pud_entry)
- err = walk->pud_entry(pud, addr, next, walk);
- if (!err && (walk->pmd_entry || walk->pte_entry))
+ if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return err;
}
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pmd_entry || walk->pte_entry)
+ err = walk_pud_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
return boundary < end ? boundary : end;
}
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
if (pte && walk->hugetlb_entry)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
if (err)
- return err;
+ break;
} while (addr = next, addr != end);
- return 0;
+ return err;
}
#else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
@@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values means
+ * error, where we abort the current walk.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (walk->test_walk)
+ return walk->test_walk(start, end, walk);
+
+ /*
+ * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+ * range, so we don't walk over it as we do for normal vmas. However,
+ * Some callers are interested in handling hole range and they don't
+ * want to just ignore any single address range. Such users certainly
+ * define their ->pte_hole() callbacks, so let's delegate them to handle
+ * vma(VM_PFNMAP).
+ */
+ if (vma->vm_flags & VM_PFNMAP) {
+ int err = 1;
+ if (walk->pte_hole)
+ err = walk->pte_hole(start, end, walk);
+ return err ? err : 1;
+ }
+ return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma && is_vm_hugetlb_page(vma)) {
+ if (walk->hugetlb_entry)
+ err = walk_hugetlb_range(start, end, walk);
+ } else
+ err = walk_pgd_range(start, end, walk);
+ return err;
+}
/**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
+ * walk_page_range - walk page table with caller specific callbacks
*
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ * - 0 : succeeded to handle the current entry, and if you don't reach the
+ * end address yet, continue to walk.
+ * - >0 : succeeded to handle the current entry, and return to the caller
+ * with caller specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with error code.
*
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
*
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
*
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
- *
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold
+ * @walk->mm->mmap_sem, because these function traverse vma list and/or
+ * access to vma's data.
*/
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- pgd_t *pgd;
- unsigned long next;
int err = 0;
+ unsigned long next;
+ struct vm_area_struct *vma;
- if (addr >= end)
- return err;
+ if (start >= end)
+ return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
- pgd = pgd_offset(walk->mm, addr);
+ vma = find_vma(walk->mm, start);
do {
- struct vm_area_struct *vma = NULL;
-
- next = pgd_addr_end(addr, end);
+ if (!vma) { /* after the last vma */
+ walk->vma = NULL;
+ next = end;
+ } else if (start < vma->vm_start) { /* outside vma */
+ walk->vma = NULL;
+ next = min(end, vma->vm_start);
+ } else { /* inside vma */
+ walk->vma = vma;
+ next = min(end, vma->vm_end);
+ vma = vma->vm_next;
- /*
- * This function was not intended to be vma based.
- * But there are vma special cases to be handled:
- * - hugetlb vma's
- * - VM_PFNMAP vma's
- */
- vma = find_vma(walk->mm, addr);
- if (vma) {
- /*
- * There are no page structures backing a VM_PFNMAP
- * range, so do not allow split_huge_page_pmd().
- */
- if ((vma->vm_start <= addr) &&
- (vma->vm_flags & VM_PFNMAP)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
- /*
- * Handle hugetlb vma individually because pagetable
- * walk for the hugetlb page is dependent on the
- * architecture and we can't handled it in the same
- * manner as non-huge pages.
- */
- if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
- is_vm_hugetlb_page(vma)) {
- if (vma->vm_end < next)
- next = vma->vm_end;
- /*
- * Hugepage is very tightly coupled with vma,
- * so walk through hugetlb entries within a
- * given vma.
- */
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
+ err = walk_page_test(start, next, walk);
+ if (err > 0)
continue;
- }
- }
-
- if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
- if (err)
+ if (err < 0)
break;
- pgd++;
- continue;
}
- if (walk->pgd_entry)
- err = walk->pgd_entry(pgd, addr, next, walk);
- if (!err &&
- (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
- err = walk_pud_range(pgd, addr, next, walk);
+ if (walk->vma || walk->pte_hole)
+ err = __walk_page_range(start, next, walk);
if (err)
break;
- pgd++;
- } while (addr = next, addr < end);
-
+ } while (start = next, start < end);
return err;
}
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+ int err;
+
+ if (!walk->mm)
+ return -EINVAL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ VM_BUG_ON(!vma);
+ walk->vma = vma;
+ err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ if (err > 0)
+ return 0;
+ if (err < 0)
+ return err;
+ return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afcd9e11..b1597690530c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
size_t bytes;
/* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages = get_user_pages(task, mm, pa, pages,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
+ pages = get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/rmap.c b/mm/rmap.c
index 70b32498d4f2..5e3e09081164 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page)
{
struct mem_cgroup *memcg;
- unsigned long flags;
- bool locked;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg);
}
static void page_remove_file_rmap(struct page *page)
{
struct mem_cgroup *memcg;
- unsigned long flags;
- bool locked;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page);
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg);
}
/**
diff --git a/mm/shmem.c b/mm/shmem.c
index b3e403181981..864c878401e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1131,7 +1131,7 @@ repeat:
* truncated or holepunched since swap was confirmed.
* shmem_undo_range() will have done some of the
* unaccounting, now delete_from_swap_cache() will do
- * the rest (including mem_cgroup_uncharge_swapcache).
+ * the rest.
* Reset swap.val? No, leave it so "failed" goes back to
* "repeat": reading a hole and writing should succeed.
*/
diff --git a/mm/util.c b/mm/util.c
index fec39d4509a9..f3ef639c4857 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -240,14 +240,8 @@ int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
- int ret;
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start, nr_pages,
- write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
-
- return ret;
+ return get_user_pages_unlocked(current, mm, start, nr_pages,
+ write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcd90c891d8e..8e645ee52045 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -91,6 +91,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Can cgroups be reclaimed below their normal consumption range? */
+ unsigned int may_thrash:1;
+
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
@@ -1903,8 +1906,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && !zone_reclaimable(zone))
- force_scan = true;
+ if (current_is_kswapd()) {
+ if (!zone_reclaimable(zone))
+ force_scan = true;
+ if (!mem_cgroup_lruvec_online(lruvec))
+ force_scan = true;
+ }
if (!global_reclaim(sc))
force_scan = true;
@@ -2290,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
struct lruvec *lruvec;
int swappiness;
+ if (mem_cgroup_low(root, memcg)) {
+ if (!sc->may_thrash)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_LOW, 1);
+ }
+
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
@@ -2311,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_iter_break(root, memcg);
break;
}
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
/*
* Shrink the slab caches in the same proportion that
@@ -2515,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
+ int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool zones_reclaimable;
-
+retry:
delayacct_freepages_start();
if (global_reclaim(sc))
@@ -2568,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (sc->compaction_ready)
return 1;
+ /* Untapped cgroup reserves? Don't OOM, retry. */
+ if (!sc->may_thrash) {
+ sc->priority = initial_priority;
+ sc->may_thrash = 1;
+ goto retry;
+ }
+
/* Any of the zones still reclaimable? Don't OOM. */
if (zones_reclaimable)
return 1;
@@ -3175,7 +3195,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
- wake_up(&pgdat->pfmemalloc_wait);
+ wake_up_all(&pgdat->pfmemalloc_wait);
/*
* Fragmentation may mean that the system cannot be rebalanced
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9943e5fd74e6..4f5cd974e11a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1437,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w)
if (need_update(cpu) &&
cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
- schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
- __round_jiffies_relative(sysctl_stat_interval, cpu));
+ schedule_delayed_work_on(cpu,
+ &per_cpu(vmstat_work, cpu), 0);
put_online_cpus();
@@ -1452,7 +1452,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))