summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-07-28 16:36:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-07-28 16:36:48 -0700
commit1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9 (patch)
tree6d227487ca2cf391589c73af1c40ec7b7126feec /mm
parent6039b80eb50a893476fea7d56e86ed2d19290054 (diff)
parentc3486f5376696034d0fcbef8ba70c70cfcb26f51 (diff)
downloadlinux-stable-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.tar.gz
linux-stable-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.tar.bz2
linux-stable-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: "The rest of MM" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits) mm, compaction: simplify contended compaction handling mm, compaction: introduce direct compaction priority mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations mm, page_alloc: make THP-specific decisions more generic mm, page_alloc: restructure direct compaction handling in slowpath mm, page_alloc: don't retry initial attempt in slowpath mm, page_alloc: set alloc_flags only once in slowpath lib/stackdepot.c: use __GFP_NOWARN for stack allocations mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB mm, kasan: account for object redzone in SLUB's nearest_obj() mm: fix use-after-free if memory allocation failed in vma_adjust() zsmalloc: Delete an unnecessary check before the function call "iput" mm/memblock.c: fix index adjustment error in __next_mem_range_rev() mem-hotplug: alloc new page from a nearest neighbor node when mem-offline mm: optimize copy_page_to/from_iter_iovec mm: add cond_resched() to generic_swapfile_activate() Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements" mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode mm: hwpoison: remove incorrect comments make __section_nr() more efficient ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c15
-rw-r--r--mm/compaction.c113
-rw-r--r--mm/filemap.c16
-rw-r--r--mm/huge_memory.c56
-rw-r--r--mm/hugetlb.c1
-rw-r--r--mm/internal.h16
-rw-r--r--mm/kasan/Makefile3
-rw-r--r--mm/kasan/kasan.c63
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/kasan/report.c8
-rw-r--r--mm/khugepaged.c16
-rw-r--r--mm/kmemleak.c4
-rw-r--r--mm/memblock.c61
-rw-r--r--mm/memcontrol.c260
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/memory_hotplug.c45
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mempool.c18
-rw-r--r--mm/migrate.c39
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap.c20
-rw-r--r--mm/oom_kill.c187
-rw-r--r--mm/page-writeback.c128
-rw-r--r--mm/page_alloc.c557
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/rmap.c26
-rw-r--r--mm/shmem.c14
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slub.c59
-rw-r--r--mm/sparse.c12
-rw-r--r--mm/swap.c76
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/util.c4
-rw-r--r--mm/vmscan.c1023
-rw-r--r--mm/vmstat.c417
-rw-r--r--mm/workingset.c62
-rw-r--r--mm/zsmalloc.c74
39 files changed, 1888 insertions, 1542 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3c81803b00a3..c0837845c17c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -681,7 +681,7 @@ config IDLE_PAGE_TRACKING
See Documentation/vm/idle_page_tracking.txt for more details.
config ZONE_DEVICE
- bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+ bool "Device memory (pmem, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ed173b8ae8f2..efe237742074 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -947,24 +947,24 @@ long congestion_wait(int sync, long timeout)
EXPORT_SYMBOL(congestion_wait);
/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
- * @zone: A zone to check if it is heavily congested
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
+ * @pgdat: A pgdat to check if it is heavily congested
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
* In the event of a congested backing_dev (any backing_dev) and the given
- * @zone has experienced recent congestion, this waits for up to @timeout
+ * @pgdat has experienced recent congestion, this waits for up to @timeout
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, cond_resched() is called to yield
+ * In the absence of pgdat congestion, cond_resched() is called to yield
* the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep.
*/
-long wait_iff_congested(struct zone *zone, int sync, long timeout)
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
@@ -973,12 +973,13 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
/*
* If there is no congestion, or heavy congestion is not being
- * encountered in the current zone, yield if necessary instead
+ * encountered in the current pgdat, yield if necessary instead
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
- !test_bit(ZONE_CONGESTED, &zone->flags)) {
+ !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
cond_resched();
+
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
diff --git a/mm/compaction.c b/mm/compaction.c
index 64df5fe052db..9affb2908304 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -331,7 +331,7 @@ static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
{
if (cc->mode == MIGRATE_ASYNC) {
if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = COMPACT_CONTENDED_LOCK;
+ cc->contended = true;
return false;
}
} else {
@@ -365,13 +365,13 @@ static bool compact_unlock_should_abort(spinlock_t *lock,
}
if (fatal_signal_pending(current)) {
- cc->contended = COMPACT_CONTENDED_SCHED;
+ cc->contended = true;
return true;
}
if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = COMPACT_CONTENDED_SCHED;
+ cc->contended = true;
return true;
}
cond_resched();
@@ -394,7 +394,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
/* async compaction aborts if contended */
if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = COMPACT_CONTENDED_SCHED;
+ cc->contended = true;
return true;
}
@@ -646,8 +646,8 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
list_for_each_entry(page, &cc->migratepages, lru)
count[!!page_is_file_cache(page)]++;
- mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
- mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
@@ -655,12 +655,12 @@ static bool too_many_isolated(struct zone *zone)
{
unsigned long active, inactive, isolated;
- inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_ANON);
- active = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_ACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
+ node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
+ active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
+ node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
+ isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
+ node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
}
@@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&zone->lru_lock, flags,
+ && compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc))
break;
@@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&zone->lru_lock,
+ spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
locked = false;
}
@@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(&zone->lru_lock,
+ locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc);
if (!locked)
break;
@@ -856,7 +856,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -899,7 +899,7 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
acct_isolated(zone, cc);
@@ -927,7 +927,7 @@ isolate_fail:
low_pfn = end_pfn;
if (locked)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
/*
* Update the pageblock-skip information and cached scanner pfn,
@@ -1200,7 +1200,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
- (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+ (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/*
* Start at where we last stopped, or beginning of the zone as
@@ -1619,14 +1619,11 @@ out:
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
- if (ret == COMPACT_CONTENDED)
- ret = COMPACT_PARTIAL;
-
return ret;
}
static enum compact_result compact_zone_order(struct zone *zone, int order,
- gfp_t gfp_mask, enum migrate_mode mode, int *contended,
+ gfp_t gfp_mask, enum compact_priority prio,
unsigned int alloc_flags, int classzone_idx)
{
enum compact_result ret;
@@ -1636,7 +1633,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
- .mode = mode,
+ .mode = (prio == COMPACT_PRIO_ASYNC) ?
+ MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
.direct_compaction = true,
@@ -1649,7 +1647,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *contended = cc.contended;
return ret;
}
@@ -1662,50 +1659,38 @@ int sysctl_extfrag_threshold = 500;
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @mode: The migration mode for async, sync light, or sync migration
- * @contended: Return value that determines if compaction was aborted due to
- * need_resched() or lock contention
*
* This is the main entry point for direct page compaction.
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
+ enum compact_priority prio)
{
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
-
- *contended = COMPACT_CONTENDED_NONE;
/* Check if the GFP flags allow compaction */
- if (!order || !may_enter_fs || !may_perform_io)
+ if (!may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
- trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
+ trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
enum compact_result status;
- int zone_contended;
if (compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
continue;
}
- status = compact_zone_order(zone, order, gfp_mask, mode,
- &zone_contended, alloc_flags,
- ac_classzone_idx(ac));
+ status = compact_zone_order(zone, order, gfp_mask, prio,
+ alloc_flags, ac_classzone_idx(ac));
rc = max(status, rc);
- /*
- * It takes at least one zone that wasn't lock contended
- * to clear all_zones_contended.
- */
- all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
@@ -1717,59 +1702,29 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
* succeeds in this zone.
*/
compaction_defer_reset(zone, order, false);
- /*
- * It is possible that async compaction aborted due to
- * need_resched() and the watermarks were ok thanks to
- * somebody else freeing memory. The allocation can
- * however still fail so we better signal the
- * need_resched() contention anyway (this will not
- * prevent the allocation attempt).
- */
- if (zone_contended == COMPACT_CONTENDED_SCHED)
- *contended = COMPACT_CONTENDED_SCHED;
- goto break_loop;
+ break;
}
- if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
- status == COMPACT_PARTIAL_SKIPPED)) {
+ if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
+ status == COMPACT_PARTIAL_SKIPPED))
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
* succeeding after all, it will be reset.
*/
defer_compaction(zone, order);
- }
/*
* We might have stopped compacting due to need_resched() in
* async compaction, or due to a fatal signal detected. In that
- * case do not try further zones and signal need_resched()
- * contention.
- */
- if ((zone_contended == COMPACT_CONTENDED_SCHED)
- || fatal_signal_pending(current)) {
- *contended = COMPACT_CONTENDED_SCHED;
- goto break_loop;
- }
-
- continue;
-break_loop:
- /*
- * We might not have tried all the zones, so be conservative
- * and assume they are not all lock contended.
+ * case do not try further zones
*/
- all_zones_contended = 0;
- break;
+ if ((prio == COMPACT_PRIO_ASYNC && need_resched())
+ || fatal_signal_pending(current))
+ break;
}
- /*
- * If at least one zone wasn't deferred or skipped, we report if all
- * zones that were tried were lock contended.
- */
- if (rc > COMPACT_INACTIVE && all_zones_contended)
- *contended = COMPACT_CONTENDED_LOCK;
-
return rc;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index e90c1543ec2d..c5f5e46c6f7f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -95,8 +95,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
- * ->zone.lru_lock (follow_page->mark_page_accessed)
- * ->zone.lru_lock (check_pte_range->isolate_lru_page)
+ * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
+ * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -218,11 +218,11 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(page))
- __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
- __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
+ __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_SHMEM_THPS);
+ __dec_node_page_state(page, NR_SHMEM_THPS);
} else {
VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
}
@@ -568,9 +568,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
* hugetlb pages do not participate in page cache accounting.
*/
if (!PageHuge(new))
- __inc_zone_page_state(new, NR_FILE_PAGES);
+ __inc_node_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
- __inc_zone_page_state(new, NR_SHMEM);
+ __inc_node_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
mem_cgroup_migrate(old, new);
radix_tree_preload_end();
@@ -677,7 +677,7 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting. */
if (!huge)
- __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_node_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3647334c2ef9..2373f0a7d340 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -539,23 +539,26 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
}
/*
- * If THP is set to always then directly reclaim/compact as necessary
- * If set to defer then do no reclaim and defer to khugepaged
+ * If THP defrag is set to always then directly reclaim/compact as necessary
+ * If set to defer then do only background reclaim/compact and defer to khugepaged
* If set to madvise and the VMA is flagged then directly reclaim/compact
+ * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
*/
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
- gfp_t reclaim_flags = 0;
-
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
- (vma->vm_flags & VM_HUGEPAGE))
- reclaim_flags = __GFP_DIRECT_RECLAIM;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- reclaim_flags = __GFP_KSWAPD_RECLAIM;
- else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
- reclaim_flags = __GFP_DIRECT_RECLAIM;
-
- return GFP_TRANSHUGE | reclaim_flags;
+ bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+ &transparent_hugepage_flags) && vma_madvised)
+ return GFP_TRANSHUGE;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ &transparent_hugepage_flags))
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ &transparent_hugepage_flags))
+ return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ return GFP_TRANSHUGE_LIGHT;
}
/* Caller must hold page table lock. */
@@ -1249,25 +1252,26 @@ out:
return 0;
}
-int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+/*
+ * Return true if we do MADV_FREE successfully on entire pmd page.
+ * Otherwise, return false.
+ */
+bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next)
-
{
spinlock_t *ptl;
pmd_t orig_pmd;
struct page *page;
struct mm_struct *mm = tlb->mm;
- int ret = 0;
+ bool ret = false;
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
goto out_unlocked;
orig_pmd = *pmd;
- if (is_huge_zero_pmd(orig_pmd)) {
- ret = 1;
+ if (is_huge_zero_pmd(orig_pmd))
goto out;
- }
page = pmd_page(orig_pmd);
/*
@@ -1309,7 +1313,7 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- ret = 1;
+ ret = true;
out:
spin_unlock(ptl);
out_unlocked:
@@ -1586,7 +1590,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_zone_page_state(page, NR_ANON_THPS);
+ __dec_node_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -1818,7 +1822,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end = -1;
int i;
- lruvec = mem_cgroup_page_lruvec(head, zone);
+ lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
@@ -1848,7 +1852,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock(&head->mapping->tree_lock);
}
- spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
@@ -2034,7 +2038,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+ spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
if (mapping) {
void **pslot;
@@ -2061,7 +2065,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
list_del(page_deferred_list(head));
}
if (mapping)
- __dec_zone_page_state(page, NR_SHMEM_THPS);
+ __dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
ret = 0;
@@ -2077,7 +2081,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
spin_unlock(&mapping->tree_lock);
- spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
ret = -EBUSY;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51a04e5e9373..f904246a8fd5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4391,7 +4391,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
/*
* This function is called from memory failure code.
- * Assume the caller holds page lock of the head page.
*/
int dequeue_hwpoisoned_huge_page(struct page *hpage)
{
diff --git a/mm/internal.h b/mm/internal.h
index 9b6a6c43ac39..1501304f87a4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -78,7 +78,7 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern bool zone_reclaimable(struct zone *zone);
+extern bool pgdat_reclaimable(struct pglist_data *pgdat);
/*
* in mm/rmap.c:
@@ -185,10 +185,7 @@ struct compact_control {
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
const int classzone_idx; /* zone index of a direct compactor */
struct zone *zone;
- int contended; /* Signal need_sched() or lock
- * contention detected during
- * compaction
- */
+ bool contended; /* Signal lock or sched contention */
};
unsigned long
@@ -433,10 +430,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define ZONE_RECLAIM_NOSCAN -2
-#define ZONE_RECLAIM_FULL -1
-#define ZONE_RECLAIM_SOME 0
-#define ZONE_RECLAIM_SUCCESS 1
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
@@ -467,7 +464,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
-#define ALLOC_FAIR 0x100 /* fair zone allocation */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1548749a3d45..2976a9ee104f 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -7,5 +7,4 @@ CFLAGS_REMOVE_kasan.o = -pg
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-obj-y := kasan.o report.o kasan_init.o
-obj-$(CONFIG_SLAB) += quarantine.o
+obj-y := kasan.o report.o kasan_init.o quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6845f9294696..b6f99e81bfeb 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -351,7 +351,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
KASAN_FREE_PAGE);
}
-#ifdef CONFIG_SLAB
/*
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used.
@@ -373,16 +372,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
unsigned long *flags)
{
int redzone_adjust;
- /* Make sure the adjusted size is still less than
- * KMALLOC_MAX_CACHE_SIZE.
- * TODO: this check is only useful for SLAB, but not SLUB. We'll need
- * to skip it for SLUB when it starts using kasan_cache_create().
- */
- if (*size > KMALLOC_MAX_CACHE_SIZE -
- sizeof(struct kasan_alloc_meta) -
- sizeof(struct kasan_free_meta))
- return;
- *flags |= SLAB_KASAN;
+ int orig_size = *size;
+
/* Add alloc meta. */
cache->kasan_info.alloc_meta_offset = *size;
*size += sizeof(struct kasan_alloc_meta);
@@ -395,14 +386,26 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
}
redzone_adjust = optimal_redzone(cache->object_size) -
(*size - cache->object_size);
+
if (redzone_adjust > 0)
*size += redzone_adjust;
- *size = min(KMALLOC_MAX_CACHE_SIZE,
- max(*size,
- cache->object_size +
- optimal_redzone(cache->object_size)));
+
+ *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size +
+ optimal_redzone(cache->object_size)));
+
+ /*
+ * If the metadata doesn't fit, don't enable KASAN at all.
+ */
+ if (*size <= cache->kasan_info.alloc_meta_offset ||
+ *size <= cache->kasan_info.free_meta_offset) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ cache->kasan_info.free_meta_offset = 0;
+ *size = orig_size;
+ return;
+ }
+
+ *flags |= SLAB_KASAN;
}
-#endif
void kasan_cache_shrink(struct kmem_cache *cache)
{
@@ -414,6 +417,14 @@ void kasan_cache_destroy(struct kmem_cache *cache)
quarantine_remove_cache(cache);
}
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ return (cache->kasan_info.alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ (cache->kasan_info.free_meta_offset ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -431,16 +442,13 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE);
-#ifdef CONFIG_SLAB
if (cache->flags & SLAB_KASAN) {
struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object);
alloc_info->state = KASAN_STATE_INIT;
}
-#endif
}
-#ifdef CONFIG_SLAB
static inline int in_irqentry_text(unsigned long ptr)
{
return (ptr >= (unsigned long)&__irqentry_text_start &&
@@ -501,7 +509,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
return (void *)object + cache->kasan_info.free_meta_offset;
}
-#endif
void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
{
@@ -522,16 +529,16 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
bool kasan_slab_free(struct kmem_cache *cache, void *object)
{
-#ifdef CONFIG_SLAB
/* RCU slabs could be legally used after free within the RCU period */
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return false;
if (likely(cache->flags & SLAB_KASAN)) {
- struct kasan_alloc_meta *alloc_info =
- get_alloc_info(cache, object);
- struct kasan_free_meta *free_info =
- get_free_info(cache, object);
+ struct kasan_alloc_meta *alloc_info;
+ struct kasan_free_meta *free_info;
+
+ alloc_info = get_alloc_info(cache, object);
+ free_info = get_free_info(cache, object);
switch (alloc_info->state) {
case KASAN_STATE_ALLOC:
@@ -550,10 +557,6 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
}
}
return false;
-#else
- kasan_poison_slab_free(cache, object);
- return false;
-#endif
}
void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -576,7 +579,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
-#ifdef CONFIG_SLAB
if (cache->flags & SLAB_KASAN) {
struct kasan_alloc_meta *alloc_info =
get_alloc_info(cache, object);
@@ -585,7 +587,6 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
alloc_info->alloc_size = size;
set_track(&alloc_info->track, flags);
}
-#endif
}
EXPORT_SYMBOL(kasan_kmalloc);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index fb87923552ef..31972cdba433 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,7 +95,6 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
const void *object);
-
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
@@ -110,7 +109,7 @@ static inline bool kasan_report_enabled(void)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
-#ifdef CONFIG_SLAB
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b3c122ddd454..861b9776841a 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -116,7 +116,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-#ifdef CONFIG_SLAB
static void print_track(struct kasan_track *track)
{
pr_err("PID = %u\n", track->pid);
@@ -130,8 +129,8 @@ static void print_track(struct kasan_track *track)
}
}
-static void object_err(struct kmem_cache *cache, struct page *page,
- void *object, char *unused_reason)
+static void kasan_object_err(struct kmem_cache *cache, struct page *page,
+ void *object, char *unused_reason)
{
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
struct kasan_free_meta *free_info;
@@ -162,7 +161,6 @@ static void object_err(struct kmem_cache *cache, struct page *page,
break;
}
}
-#endif
static void print_address_description(struct kasan_access_info *info)
{
@@ -177,7 +175,7 @@ static void print_address_description(struct kasan_access_info *info)
struct kmem_cache *cache = page->slab_cache;
object = nearest_obj(cache, page,
(void *)info->access_addr);
- object_err(cache, page, object,
+ kasan_object_err(cache, page, object,
"kasan: bad access detected");
return;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7dbee698d6aa..79c52d0061af 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -480,7 +480,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
/* 0 stands for page_is_file_cache(page) == false */
- dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ dec_node_page_state(page, NR_ISOLATED_ANON + 0);
unlock_page(page);
putback_lru_page(page);
}
@@ -576,7 +576,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
- inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ inc_node_page_state(page, NR_ISOLATED_ANON + 0);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -672,10 +672,10 @@ static bool khugepaged_scan_abort(int nid)
int i;
/*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!zone_reclaim_mode)
+ if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
@@ -694,7 +694,7 @@ static bool khugepaged_scan_abort(int nid)
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
- return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
+ return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}
#ifdef CONFIG_NUMA
@@ -1483,10 +1483,10 @@ tree_unlocked:
}
local_irq_save(flags);
- __inc_zone_page_state(new_page, NR_SHMEM_THPS);
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
if (nr_none) {
- __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none);
- __mod_zone_page_state(zone, NR_SHMEM, nr_none);
+ __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+ __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
}
local_irq_restore(flags);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 04320d3adbef..086292f7c59d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1485,8 +1485,10 @@ static int kmemleak_scan_thread(void *arg)
* Wait before the first scan to allow the system to fully initialize.
*/
if (first_run) {
+ signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
first_run = 0;
- ssleep(SECS_FIRST_SCAN);
+ while (timeout && !kthread_should_stop())
+ timeout = schedule_timeout_interruptible(timeout);
}
while (!kthread_should_stop()) {
diff --git a/mm/memblock.c b/mm/memblock.c
index ca099159b45a..ff5ff3b5f1ea 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,7 +20,7 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
-#include <asm-generic/sections.h>
+#include <asm/sections.h>
#include <linux/io.h>
#include "internal.h"
@@ -1027,7 +1027,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
*out_end = m_end;
if (out_nid)
*out_nid = m_nid;
- idx_a++;
+ idx_a--;
*idx = (u32)idx_a | (u64)idx_b << 32;
return;
}
@@ -1465,15 +1465,16 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
}
-void __init memblock_enforce_memory_limit(phys_addr_t limit)
+static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
struct memblock_region *r;
- if (!limit)
- return;
-
- /* find out max address */
+ /*
+ * translate the memory @limit size into the max address within one of
+ * the memory memblock regions, if the @limit exceeds the total size
+ * of those regions, max_addr will keep original value ULLONG_MAX
+ */
for_each_memblock(memory, r) {
if (limit <= r->size) {
max_addr = r->base + limit;
@@ -1482,6 +1483,22 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
limit -= r->size;
}
+ return max_addr;
+}
+
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
+{
+ phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+
+ if (!limit)
+ return;
+
+ max_addr = __find_max_addr(limit);
+
+ /* @limit exceeds the total size of the memory, do nothing */
+ if (max_addr == (phys_addr_t)ULLONG_MAX)
+ return;
+
/* truncate both memory and reserved regions */
memblock_remove_range(&memblock.memory, max_addr,
(phys_addr_t)ULLONG_MAX);
@@ -1489,6 +1506,36 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
(phys_addr_t)ULLONG_MAX);
}
+void __init memblock_mem_limit_remove_map(phys_addr_t limit)
+{
+ struct memblock_type *type = &memblock.memory;
+ phys_addr_t max_addr;
+ int i, ret, start_rgn, end_rgn;
+
+ if (!limit)
+ return;
+
+ max_addr = __find_max_addr(limit);
+
+ /* @limit exceeds the total size of the memory, do nothing */
+ if (max_addr == (phys_addr_t)ULLONG_MAX)
+ return;
+
+ ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
+ &start_rgn, &end_rgn);
+ if (ret)
+ return;
+
+ /* remove all the MAP regions above the limit */
+ for (i = end_rgn - 1; i >= start_rgn; i--) {
+ if (!memblock_is_nomap(&type->regions[i]))
+ memblock_remove_region(type, i);
+ }
+ /* truncate the reserved regions */
+ memblock_remove_range(&memblock.reserved, max_addr,
+ (phys_addr_t)ULLONG_MAX);
+}
+
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
{
unsigned int left = 0, right = type->cnt;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3a84c64f35c..c265212bec8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = {
* their hierarchy representation
*/
-struct mem_cgroup_tree_per_zone {
+struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
spinlock_t lock;
};
-struct mem_cgroup_tree_per_node {
- struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
@@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif /* !CONFIG_SLOB */
-static struct mem_cgroup_per_zone *
-mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
-{
- int nid = zone_to_nid(zone);
- int zid = zone_idx(zone);
-
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page)
return ino;
}
-static struct mem_cgroup_per_zone *
-mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+static struct mem_cgroup_per_node *
+mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
- int zid = page_zonenum(page);
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
+ return memcg->nodeinfo[nid];
}
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
+static struct mem_cgroup_tree_per_node *
+soft_limit_tree_node(int nid)
{
- return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+ return soft_limit_tree.rb_tree_per_node[nid];
}
-static struct mem_cgroup_tree_per_zone *
+static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
int nid = page_to_nid(page);
- int zid = page_zonenum(page);
- return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+ return soft_limit_tree.rb_tree_per_node[nid];
}
-static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz,
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
- struct mem_cgroup_per_zone *mz_node;
+ struct mem_cgroup_per_node *mz_node;
if (mz->on_tree)
return;
@@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
return;
while (*p) {
parent = *p;
- mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+ mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess)
p = &(*p)->rb_left;
@@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = true;
}
-static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz)
{
if (!mz->on_tree)
return;
@@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
mz->on_tree = false;
}
-static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+ struct mem_cgroup_tree_per_node *mctz)
{
unsigned long flags;
@@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
unsigned long excess;
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup_tree_per_zone *mctz;
+ struct mem_cgroup_per_node *mz;
+ struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page);
/*
@@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_page_zoneinfo(memcg, page);
+ mz = mem_cgroup_page_nodeinfo(memcg, page);
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
@@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
- struct mem_cgroup_tree_per_zone *mctz;
- struct mem_cgroup_per_zone *mz;
- int nid, zid;
+ struct mem_cgroup_tree_per_node *mctz;
+ struct mem_cgroup_per_node *mz;
+ int nid;
for_each_node(nid) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- mctz = soft_limit_tree_node_zone(nid, zid);
- mem_cgroup_remove_exceeded(mz, mctz);
- }
+ mz = mem_cgroup_nodeinfo(memcg, nid);
+ mctz = soft_limit_tree_node(nid);
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
struct rb_node *rightmost = NULL;
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
@@ -532,7 +515,7 @@ retry:
if (!rightmost)
goto done; /* Nothing to reclaim from */
- mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+ mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
@@ -546,10 +529,10 @@ done:
return mz;
}
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+static struct mem_cgroup_per_node *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
@@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
- int zid;
+ struct mem_cgroup_per_node *mz;
+ enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids);
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- nr += mz->lru_size[lru];
- }
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ mz = mem_cgroup_nodeinfo(memcg, nid);
+ nr += mz->lru_size[lru];
}
return nr;
}
@@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
rcu_read_lock();
if (reclaim) {
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
- mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
iter = &mz->iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation)
@@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
- struct mem_cgroup_per_zone *mz;
- int nid, zid;
+ struct mem_cgroup_per_node *mz;
+ int nid;
int i;
while ((memcg = parent_mem_cgroup(memcg))) {
for_each_node(nid) {
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ mz = mem_cgroup_nodeinfo(memcg, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
@@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
iter = mem_cgroup_iter(NULL, iter, NULL))
/**
- * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
- * @zone: zone of the wanted lruvec
- * @memcg: memcg of the wanted lruvec
- *
- * Returns the lru list vector holding pages for the given @zone and
- * @mem. This can be the global zone lruvec, if the memory controller
- * is disabled.
- */
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
- struct mem_cgroup *memcg)
-{
- struct mem_cgroup_per_zone *mz;
- struct lruvec *lruvec;
-
- if (mem_cgroup_disabled()) {
- lruvec = &zone->lruvec;
- goto out;
- }
-
- mz = mem_cgroup_zone_zoneinfo(memcg, zone);
- lruvec = &mz->lruvec;
-out:
- /*
- * Since a node can be onlined after the mem_cgroup was created,
- * we have to be prepared to initialize lruvec->zone here;
- * and if offlined then reonlined, we need to reinitialize it.
- */
- if (unlikely(lruvec->zone != zone))
- lruvec->zone = zone;
- return lruvec;
-}
-
-/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
* @zone: zone of the page
@@ -985,14 +929,14 @@ out:
* and putback protocol: the LRU lock must be held, and the page must
* either be PageLRU() or the caller must have isolated/allocated it.
*/
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
- lruvec = &zone->lruvec;
+ lruvec = &pgdat->lruvec;
goto out;
}
@@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
if (!memcg)
memcg = root_mem_cgroup;
- mz = mem_cgroup_page_zoneinfo(memcg, page);
+ mz = mem_cgroup_page_nodeinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
@@ -1012,8 +956,8 @@ out:
* we have to be prepared to initialize lruvec->zone here;
* and if offlined then reonlined, we need to reinitialize it.
*/
- if (unlikely(lruvec->zone != zone))
- lruvec->zone = zone;
+ if (unlikely(lruvec->pgdat != pgdat))
+ lruvec->pgdat = pgdat;
return lruvec;
}
@@ -1030,17 +974,15 @@ out:
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages)
{
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;
bool empty;
- __update_lru_size(lruvec, lru, nr_pages);
-
if (mem_cgroup_disabled())
return;
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
lru_size = mz->lru_size + lru;
empty = list_empty(lruvec->lists + lru);
@@ -1276,9 +1218,9 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || task_will_free_mem(current)) {
+ if (task_will_free_mem(current)) {
mark_oom_victim(current);
- try_oom_reaper(current);
+ wake_oom_reaper(current);
goto unlock;
}
@@ -1433,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
#endif
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
- struct zone *zone,
+ pg_data_t *pgdat,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
@@ -1443,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
- .zone = zone,
+ .pgdat = pgdat,
.priority = 0,
};
@@ -1473,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
}
continue;
}
- total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
- zone, &nr_scanned);
+ total += mem_cgroup_shrink_node(victim, gfp_mask, false,
+ pgdat, &nr_scanned);
*total_scanned += nr_scanned;
if (!soft_limit_excess(root_memcg))
break;
@@ -2107,11 +2049,11 @@ static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (PageLRU(page)) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
*isolated = 1;
@@ -2126,12 +2068,12 @@ static void unlock_page_lru(struct page *page, int isolated)
if (isolated) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2431,7 +2373,7 @@ void memcg_kmem_uncharge(struct page *page, int order)
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock and migration entries setup in all page mappings.
+ * zone_lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -2601,22 +2543,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
return ret;
}
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
- struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+ struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
- struct mem_cgroup_tree_per_zone *mctz;
+ struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
unsigned long nr_scanned;
if (order > 0)
return 0;
- mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+ mctz = soft_limit_tree_node(pgdat->node_id);
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -2631,7 +2573,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
nr_scanned = 0;
- reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
@@ -3252,22 +3194,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
#ifdef CONFIG_DEBUG_VM
{
- int nid, zid;
- struct mem_cgroup_per_zone *mz;
+ pg_data_t *pgdat;
+ struct mem_cgroup_per_node *mz;
struct zone_reclaim_stat *rstat;
unsigned long recent_rotated[2] = {0, 0};
unsigned long recent_scanned[2] = {0, 0};
- for_each_online_node(nid)
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
- rstat = &mz->lruvec.reclaim_stat;
+ for_each_online_pgdat(pgdat) {
+ mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
- }
+ recent_rotated[0] += rstat->recent_rotated[0];
+ recent_rotated[1] += rstat->recent_rotated[1];
+ recent_scanned[0] += rstat->recent_scanned[0];
+ recent_scanned[1] += rstat->recent_scanned[1];
+ }
seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
@@ -4147,11 +4088,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id);
}
-static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- struct mem_cgroup_per_zone *mz;
- int zone, tmp = node;
+ int tmp = node;
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
@@ -4166,18 +4106,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = &pn->zoneinfo[zone];
- lruvec_init(&mz->lruvec);
- mz->usage_in_excess = 0;
- mz->on_tree = false;
- mz->memcg = memcg;
- }
+ lruvec_init(&pn->lruvec);
+ pn->usage_in_excess = 0;
+ pn->on_tree = false;
+ pn->memcg = memcg;
+
memcg->nodeinfo[node] = pn;
return 0;
}
-static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
+static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
kfree(memcg->nodeinfo[node]);
}
@@ -4188,7 +4126,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
memcg_wb_domain_exit(memcg);
for_each_node(node)
- free_mem_cgroup_per_zone_info(memcg, node);
+ free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
kfree(memcg);
}
@@ -4217,7 +4155,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
for_each_node(node)
- if (alloc_mem_cgroup_per_zone_info(memcg, node))
+ if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
@@ -5233,7 +5171,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "file %llu\n",
(u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
- (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
(u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
@@ -5820,18 +5758,12 @@ static int __init mem_cgroup_init(void)
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
- int zone;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE);
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct mem_cgroup_tree_per_zone *rtpz;
-
- rtpz = &rtpn->rb_tree_per_zone[zone];
- rtpz->rb_root = RB_ROOT;
- spin_lock_init(&rtpz->lock);
- }
+ rtpn->rb_root = RB_ROOT;
+ spin_lock_init(&rtpn->lock);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2fcca6b0e005..de88f33519c0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -741,8 +741,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* page->lru because it can be used in other hugepage operations,
* such as __unmap_hugepage_range() and gather_surplus_pages().
* So instead we use page_mapping() and PageAnon().
- * We assume that this function is called with page lock held,
- * so there is no race between isolation and mapping/unmapping.
*/
if (!(page_mapping(hpage) || PageAnon(hpage))) {
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1663,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
if (!ret) {
LIST_HEAD(pagelist);
- inc_zone_page_state(page, NR_ISOLATED_ANON +
+ inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
@@ -1671,7 +1669,7 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret) {
if (!list_empty(&pagelist)) {
list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
+ dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 82d0b98d27f8..3894b65b1555 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1209,9 +1209,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
arch_refresh_nodedata(nid, pgdat);
} else {
- /* Reset the nr_zones and classzone_idx to 0 before reuse */
+ /* Reset the nr_zones, order and classzone_idx before reuse */
pgdat->nr_zones = 0;
- pgdat->classzone_idx = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = 0;
}
/* we can use NODE_DATA(nid) from here */
@@ -1547,6 +1548,37 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
return 0;
}
+static struct page *new_node_page(struct page *page, unsigned long private,
+ int **result)
+{
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+ int nid = page_to_nid(page);
+ nodemask_t nmask = node_online_map;
+ struct page *new_page;
+
+ /*
+ * TODO: allocate a destination hugepage from a nearest neighbor node,
+ * accordance with memory policy of the user process if possible. For
+ * now as a simple work-around, we use the next node for destination.
+ */
+ if (PageHuge(page))
+ return alloc_huge_page_node(page_hstate(compound_head(page)),
+ next_node_in(nid, nmask));
+
+ node_clear(nid, nmask);
+ if (PageHighMem(page)
+ || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, 0,
+ node_zonelist(nid, gfp_mask), &nmask);
+ if (!new_page)
+ new_page = __alloc_pages(gfp_mask, 0,
+ node_zonelist(nid, gfp_mask));
+
+ return new_page;
+}
+
#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -1586,7 +1618,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
list_add_tail(&page->lru, &source);
move_pages--;
- inc_zone_page_state(page, NR_ISOLATED_ANON +
+ inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
} else {
@@ -1610,11 +1642,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
goto out;
}
- /*
- * alloc_migrate_target should be improooooved!!
- * migrate_pages returns # of failed pages.
- */
- ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
+ /* Allocate a new page from the nearest neighbor node */
+ ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret)
putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 53e40d3f3933..d8c4e38fb5f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -962,7 +962,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist);
- inc_zone_page_state(page, NR_ISOLATED_ANON +
+ inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
}
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 8f65464da5de..47a659dedd44 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
+ * Note: using __GFP_ZERO is not supported.
*/
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
@@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait;
gfp_t gfp_temp;
- /* If oom killed, memory reserves are essential to prevent livelock */
- VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
- /* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
-
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
- if (likely(pool->curr_nr)) {
- /*
- * Don't allocate from emergency reserves if there are
- * elements available. This check is racy, but it will
- * be rechecked each loop.
- */
- gfp_temp |= __GFP_NOMEMALLOC;
- }
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
@@ -359,12 +348,11 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
- if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
+ if (gfp_temp != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
- gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 2232f6923cc7..f7ee04a5ae27 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,7 +168,7 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
+ dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/*
* We isolated non-lru movable page so here we can use
@@ -501,19 +501,21 @@ int migrate_page_move_mapping(struct address_space *mapping,
* new page and drop references to the old page.
*
* Note that anonymous pages are accounted for
- * via NR_FILE_PAGES and NR_ANON_PAGES if they
+ * via NR_FILE_PAGES and NR_ANON_MAPPED if they
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_zone_state(oldzone, NR_FILE_PAGES);
- __inc_zone_state(newzone, NR_FILE_PAGES);
+ __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
+ __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_zone_state(oldzone, NR_SHMEM);
- __inc_zone_state(newzone, NR_SHMEM);
+ __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
+ __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
}
if (dirty && mapping_cap_account_dirty(mapping)) {
- __dec_zone_state(oldzone, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_FILE_DIRTY);
+ __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
+ __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
+ __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
+ __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
}
}
local_irq_enable();
@@ -1119,7 +1121,7 @@ out:
* restored.
*/
list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
+ dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
}
@@ -1460,7 +1462,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
err = isolate_lru_page(page);
if (!err) {
list_add_tail(&page->lru, &pagelist);
- inc_zone_page_state(page, NR_ISOLATED_ANON +
+ inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
}
put_and_set:
@@ -1726,15 +1728,16 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages)
{
int z;
+
+ if (!pgdat_reclaimable(pgdat))
+ return false;
+
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
if (!populated_zone(zone))
continue;
- if (!zone_reclaimable(zone))
- continue;
-
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
@@ -1828,7 +1831,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
}
page_lru = page_is_file_cache(page);
- mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
hpage_nr_pages(page));
/*
@@ -1886,7 +1889,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
+ dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
@@ -1931,7 +1934,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
+ (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
@@ -1979,7 +1982,7 @@ fail_putback:
/* Retake the callers reference and putback on LRU */
get_page(page);
putback_lru_page(page);
- mod_zone_page_state(page_zone(page),
+ mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
goto out_unlock;
@@ -2030,7 +2033,7 @@ fail_putback:
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
- mod_zone_page_state(page_zone(page),
+ mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
return isolated;
diff --git a/mm/mlock.c b/mm/mlock.c
index ef8dc9f395c4..14645be06e30 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -103,7 +103,7 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
if (PageLRU(page)) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (getpage)
get_page(page);
ClearPageLRU(page);
@@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page)
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
nr_pages = hpage_nr_pages(page);
if (!TestClearPageMlocked(page))
@@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page)
__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);
unlock_out:
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
out:
return nr_pages - 1;
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_init(&pvec_putback, 0);
/* Phase 1: page isolation */
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
}
delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mmap.c b/mm/mmap.c
index 86b18f334f4f..d44bee96a5fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -621,7 +621,6 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next;
- struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
@@ -631,17 +630,25 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
int remove_next = 0;
if (next && !insert) {
- struct vm_area_struct *exporter = NULL;
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
*/
-again: remove_next = 1 + (end > next->vm_end);
+ remove_next = 1 + (end > next->vm_end);
end = next->vm_end;
exporter = next;
importer = vma;
+
+ /*
+ * If next doesn't have anon_vma, import from vma after
+ * next, if the vma overlaps with it.
+ */
+ if (remove_next == 2 && next && !next->anon_vma)
+ exporter = next->vm_next;
+
} else if (end > next->vm_start) {
/*
* vma expands, overlapping part of the next:
@@ -675,7 +682,7 @@ again: remove_next = 1 + (end > next->vm_end);
return error;
}
}
-
+again:
vma_adjust_trans_huge(vma, start, end, adjust_next);
if (file) {
@@ -796,8 +803,11 @@ again: remove_next = 1 + (end > next->vm_end);
* up the code too much to do both in one go.
*/
next = vma->vm_next;
- if (remove_next == 2)
+ if (remove_next == 2) {
+ remove_next = 1;
+ end = next->vm_end;
goto again;
+ }
else if (next)
vma_gap_update(next);
else
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d4a929d79470..7d0a275df822 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -176,11 +176,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
/*
* Do not even consider tasks which are explicitly marked oom
- * unkillable or have been already oom reaped.
+ * unkillable or have been already oom reaped or the are in
+ * the middle of vfork
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
- test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
+ test_bit(MMF_OOM_REAPED, &p->mm->flags) ||
+ in_vfork(p)) {
task_unlock(p);
return 0;
}
@@ -281,10 +283,22 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
/*
* This task already has access to memory reserves and is being killed.
- * Don't allow any other task to have access to the reserves.
+ * Don't allow any other task to have access to the reserves unless
+ * the task has MMF_OOM_REAPED because chances that it would release
+ * any memory is quite low.
*/
- if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
- return OOM_SCAN_ABORT;
+ if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) {
+ struct task_struct *p = find_lock_task_mm(task);
+ enum oom_scan_t ret = OOM_SCAN_ABORT;
+
+ if (p) {
+ if (test_bit(MMF_OOM_REAPED, &p->mm->flags))
+ ret = OOM_SCAN_CONTINUE;
+ task_unlock(p);
+ }
+
+ return ret;
+ }
/*
* If task is allocating a lot of memory and has been marked to be
@@ -415,7 +429,7 @@ bool oom_killer_disabled __read_mostly;
* task's threads: if one of those is using this mm then this task was also
* using it.
*/
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
struct task_struct *t;
@@ -554,8 +568,27 @@ static void oom_reap_task(struct task_struct *tsk)
schedule_timeout_idle(HZ/10);
if (attempts > MAX_OOM_REAP_RETRIES) {
+ struct task_struct *p;
+
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
+
+ /*
+ * If we've already tried to reap this task in the past and
+ * failed it probably doesn't make much sense to try yet again
+ * so hide the mm from the oom killer so that it can move on
+ * to another task with a different mm struct.
+ */
+ p = find_lock_task_mm(tsk);
+ if (p) {
+ if (test_and_set_bit(MMF_OOM_NOT_REAPABLE, &p->mm->flags)) {
+ pr_info("oom_reaper: giving up pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ set_bit(MMF_OOM_REAPED, &p->mm->flags);
+ }
+ task_unlock(p);
+ }
+
debug_show_all_locks();
}
@@ -594,7 +627,7 @@ static int oom_reaper(void *unused)
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+void wake_oom_reaper(struct task_struct *tsk)
{
if (!oom_reaper_th)
return;
@@ -612,46 +645,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
wake_up(&oom_reaper_wait);
}
-/* Check if we can reap the given task. This has to be called with stable
- * tsk->mm
- */
-void try_oom_reaper(struct task_struct *tsk)
-{
- struct mm_struct *mm = tsk->mm;
- struct task_struct *p;
-
- if (!mm)
- return;
-
- /*
- * There might be other threads/processes which are either not
- * dying or even not killable.
- */
- if (atomic_read(&mm->mm_users) > 1) {
- rcu_read_lock();
- for_each_process(p) {
- if (!process_shares_mm(p, mm))
- continue;
- if (fatal_signal_pending(p))
- continue;
-
- /*
- * If the task is exiting make sure the whole thread group
- * is exiting and cannot acces mm anymore.
- */
- if (signal_group_exit(p->signal))
- continue;
-
- /* Give up */
- rcu_read_unlock();
- return;
- }
- rcu_read_unlock();
- }
-
- wake_oom_reaper(tsk);
-}
-
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -663,10 +656,6 @@ static int __init oom_init(void)
return 0;
}
subsys_initcall(oom_init)
-#else
-static void wake_oom_reaper(struct task_struct *tsk)
-{
-}
#endif
/**
@@ -743,6 +732,80 @@ void oom_killer_enable(void)
oom_killer_disabled = false;
}
+static inline bool __task_will_free_mem(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+
+ /*
+ * A coredumping process may sleep for an extended period in exit_mm(),
+ * so the oom killer cannot assume that the process will promptly exit
+ * and release memory.
+ */
+ if (sig->flags & SIGNAL_GROUP_COREDUMP)
+ return false;
+
+ if (sig->flags & SIGNAL_GROUP_EXIT)
+ return true;
+
+ if (thread_group_empty(task) && (task->flags & PF_EXITING))
+ return true;
+
+ return false;
+}
+
+/*
+ * Checks whether the given task is dying or exiting and likely to
+ * release its address space. This means that all threads and processes
+ * sharing the same mm have to be killed or exiting.
+ * Caller has to make sure that task->mm is stable (hold task_lock or
+ * it operates on the current).
+ */
+bool task_will_free_mem(struct task_struct *task)
+{
+ struct mm_struct *mm = task->mm;
+ struct task_struct *p;
+ bool ret;
+
+ /*
+ * Skip tasks without mm because it might have passed its exit_mm and
+ * exit_oom_victim. oom_reaper could have rescued that but do not rely
+ * on that for now. We can consider find_lock_task_mm in future.
+ */
+ if (!mm)
+ return false;
+
+ if (!__task_will_free_mem(task))
+ return false;
+
+ /*
+ * This task has already been drained by the oom reaper so there are
+ * only small chances it will free some more
+ */
+ if (test_bit(MMF_OOM_REAPED, &mm->flags))
+ return false;
+
+ if (atomic_read(&mm->mm_users) <= 1)
+ return true;
+
+ /*
+ * This is really pessimistic but we do not have any reliable way
+ * to check that external processes share with our mm
+ */
+ rcu_read_lock();
+ for_each_process(p) {
+ if (!process_shares_mm(p, mm))
+ continue;
+ if (same_thread_group(task, p))
+ continue;
+ ret = __task_will_free_mem(p);
+ if (!ret)
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
@@ -765,9 +828,9 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
task_lock(p);
- if (p->mm && task_will_free_mem(p)) {
+ if (task_will_free_mem(p)) {
mark_oom_victim(p);
- try_oom_reaper(p);
+ wake_oom_reaper(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -850,14 +913,18 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
- p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p)) {
/*
* We cannot use oom_reaper for the mm shared by this
* process because it wouldn't get killed and so the
- * memory might be still used.
+ * memory might be still used. Hide the mm from the oom
+ * killer to guarantee OOM forward progress.
*/
can_oom_reap = false;
+ set_bit(MMF_OOM_REAPED, &mm->flags);
+ pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
+ task_pid_nr(victim), victim->comm,
+ task_pid_nr(p), p->comm);
continue;
}
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
@@ -939,14 +1006,10 @@ bool out_of_memory(struct oom_control *oc)
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
- *
- * But don't select if current has already released its mm and cleared
- * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/
- if (current->mm &&
- (fatal_signal_pending(current) || task_will_free_mem(current))) {
+ if (task_will_free_mem(current)) {
mark_oom_victim(current);
- try_oom_reaper(current);
+ wake_oom_reaper(current);
return true;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d578d2a56b19..f4cd7d8005c9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -267,26 +267,35 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
*/
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
+ * node_dirtyable_memory - number of dirtyable pages in a node
+ * @pgdat: the node
*
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
+ * Returns the node's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-node dirty limits.
*/
-static unsigned long zone_dirtyable_memory(struct zone *zone)
+static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
- unsigned long nr_pages;
+ unsigned long nr_pages = 0;
+ int z;
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ nr_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
- nr_pages = zone_page_state(zone, NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
* clean pages in order to balance the zones.
*/
- nr_pages -= min(nr_pages, zone->totalreserve_pages);
+ nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
- nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
- nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+ nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
+ nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);
return nr_pages;
}
@@ -299,13 +308,26 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
int i;
for_each_node_state(node, N_HIGH_MEMORY) {
- for (i = 0; i < MAX_NR_ZONES; i++) {
- struct zone *z = &NODE_DATA(node)->node_zones[i];
+ for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) {
+ struct zone *z;
+ unsigned long nr_pages;
+
+ if (!is_highmem_idx(i))
+ continue;
+
+ z = &NODE_DATA(node)->node_zones[i];
+ if (!populated_zone(z))
+ continue;
- if (is_highmem(z))
- x += zone_dirtyable_memory(z);
+ nr_pages = zone_page_state(z, NR_FREE_PAGES);
+ /* watch for underflows */
+ nr_pages -= min(nr_pages, high_wmark_pages(z));
+ nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE);
+ nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE);
+ x += nr_pages;
}
}
+
/*
* Unreclaimable memory (kernel memory or anonymous memory
* without swap) can bring down the dirtyable pages below
@@ -348,8 +370,8 @@ static unsigned long global_dirtyable_memory(void)
*/
x -= min(x, totalreserve_pages);
- x += global_page_state(NR_INACTIVE_FILE);
- x += global_page_state(NR_ACTIVE_FILE);
+ x += global_node_page_state(NR_INACTIVE_FILE);
+ x += global_node_page_state(NR_ACTIVE_FILE);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -445,23 +467,23 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirty_limit - maximum number of dirty pages allowed in a zone
- * @zone: the zone
+ * node_dirty_limit - maximum number of dirty pages allowed in a node
+ * @pgdat: the node
*
- * Returns the maximum number of dirty pages allowed in a zone, based
- * on the zone's dirtyable memory.
+ * Returns the maximum number of dirty pages allowed in a node, based
+ * on the node's dirtyable memory.
*/
-static unsigned long zone_dirty_limit(struct zone *zone)
+static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
- unsigned long zone_memory = zone_dirtyable_memory(zone);
+ unsigned long node_memory = node_dirtyable_memory(pgdat);
struct task_struct *tsk = current;
unsigned long dirty;
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
- zone_memory / global_dirtyable_memory();
+ node_memory / global_dirtyable_memory();
else
- dirty = vm_dirty_ratio * zone_memory / 100;
+ dirty = vm_dirty_ratio * node_memory / 100;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
dirty += dirty / 4;
@@ -470,19 +492,22 @@ static unsigned long zone_dirty_limit(struct zone *zone)
}
/**
- * zone_dirty_ok - tells whether a zone is within its dirty limits
- * @zone: the zone to check
+ * node_dirty_ok - tells whether a node is within its dirty limits
+ * @pgdat: the node to check
*
- * Returns %true when the dirty pages in @zone are within the zone's
+ * Returns %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded.
*/
-bool zone_dirty_ok(struct zone *zone)
+bool node_dirty_ok(struct pglist_data *pgdat)
{
- unsigned long limit = zone_dirty_limit(zone);
+ unsigned long limit = node_dirty_limit(pgdat);
+ unsigned long nr_pages = 0;
+
+ nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
+ nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
+ nr_pages += node_page_state(pgdat, NR_WRITEBACK);
- return zone_page_state(zone, NR_FILE_DIRTY) +
- zone_page_state(zone, NR_UNSTABLE_NFS) +
- zone_page_state(zone, NR_WRITEBACK) <= limit;
+ return nr_pages <= limit;
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -1570,10 +1595,10 @@ static void balance_dirty_pages(struct address_space *mapping,
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS);
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+ gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc);
@@ -1910,8 +1935,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS);
domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh)
@@ -1955,8 +1980,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */
- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
+ if (global_node_page_state(NR_UNSTABLE_NFS) +
+ global_node_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1984,8 +2009,8 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void laptop_mode_timer_fn(unsigned long data)
{
struct request_queue *q = (struct request_queue *)data;
- int nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS);
struct bdi_writeback *wb;
/*
@@ -2436,8 +2461,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
wb = inode_to_wb(inode);
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- __inc_zone_page_state(page, NR_DIRTIED);
+ __inc_node_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ __inc_node_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
@@ -2457,7 +2483,8 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
{
if (mapping_cap_account_dirty(mapping)) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
- dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE);
}
@@ -2525,7 +2552,7 @@ void account_page_redirty(struct page *page)
wb = unlocked_inode_to_wb_begin(inode, &locked);
current->nr_dirtied--;
- dec_zone_page_state(page, NR_DIRTIED);
+ dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
unlocked_inode_to_wb_end(inode, locked);
}
@@ -2713,7 +2740,8 @@ int clear_page_dirty_for_io(struct page *page)
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
- dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_node_page_state(page, NR_FILE_DIRTY);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
@@ -2759,8 +2787,9 @@ int test_clear_page_writeback(struct page *page)
}
if (ret) {
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
- dec_zone_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_WRITTEN);
+ dec_node_page_state(page, NR_WRITEBACK);
+ dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ inc_node_page_state(page, NR_WRITTEN);
}
unlock_page_memcg(page);
return ret;
@@ -2813,7 +2842,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
}
if (!ret) {
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
- inc_zone_page_state(page, NR_WRITEBACK);
+ inc_node_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
unlock_page_memcg(page);
return ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452513bf02ce..ea759b935360 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
-
- return false;
-}
-
/*
* Returns false when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- return false;
-}
-
static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
@@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
while (count) {
struct page *page;
@@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone,
{
unsigned long nr_scanned;
spin_lock(&zone->lock);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
@@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- /* Set the pageblock if the isolated page is at least a pageblock */
+ /*
+ * Set the pageblock if the isolated page is at least half of a
+ * pageblock
+ */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
else
page = list_first_entry(list, struct page, lru);
- __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
@@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
- !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
- set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
@@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return local_zone->node == zone->node;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return true;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
- struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
- do {
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
- } while (zone++ != preferred_zone);
-}
-
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- bool fair_skipped = false;
- bool apply_fair = (alloc_flags & ALLOC_FAIR);
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
-zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2904,50 +2864,33 @@ zonelist_scan:
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (apply_fair) {
- if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- fair_skipped = true;
- continue;
- }
- if (!zone_local(ac->preferred_zoneref->zone, zone)) {
- if (fair_skipped)
- goto reset_fair;
- apply_fair = false;
- }
- }
- /*
* When allocating a page cache page for writing, we
- * want to get it from a zone that is within its dirty
- * limit, such that no single zone holds more than its
+ * want to get it from a node that is within its dirty
+ * limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
- * The dirty limits take into account the zone's
+ * The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
- * This may look like it could increase pressure on
- * lower zones by failing allocations in higher zones
- * before they are full. But the pages that do spill
- * over are limited as the lower zones are protected
- * by this very same mechanism. It should not become
- * a practical burden to them.
- *
* XXX: For now, allow allocations to potentially
- * exceed the per-zone dirty limit in the slowpath
+ * exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
- * zones are together not big enough to reach the
+ * nodes are together not big enough to reach the
* global limit. The proper fix for these situations
- * will require awareness of zones in the
+ * will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
- if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
- continue;
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark,
@@ -2959,16 +2902,16 @@ zonelist_scan:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -2998,23 +2941,6 @@ try_this_zone:
}
}
- /*
- * The first pass makes sure allocations are spread fairly within the
- * local node. However, the local node might have free pages left
- * after the fairness batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without fairness, and
- * include remote zones now, before entering the slowpath and waking
- * kswapd: prefer spilling to a remote zone over swapping locally.
- */
- if (fair_skipped) {
-reset_fair:
- apply_fair = false;
- fair_skipped = false;
- reset_alloc_batches(ac->preferred_zoneref->zone);
- z = ac->preferred_zoneref;
- goto zonelist_scan;
- }
-
return NULL;
}
@@ -3159,7 +3085,6 @@ out:
return page;
}
-
/*
* Maximum number of compaction retries wit a progress before OOM
* killer is consider as the only way to move forward.
@@ -3171,17 +3096,16 @@ out:
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
- int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, &contended_compaction);
+ prio);
current->flags &= ~PF_MEMALLOC;
if (*compact_result <= COMPACT_INACTIVE)
@@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
- *compact_result = COMPACT_CONTENDED;
-
- /*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
- */
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
- *compact_result = COMPACT_CONTENDED;
-
cond_resched();
return NULL;
@@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
- enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
int max_retries = MAX_COMPACT_RETRIES;
@@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
/*
* compaction considers all the zone as desperately out of memory
* so it doesn't really make much sense to retry except when the
- * failure could be caused by weak migration mode.
+ * failure could be caused by insufficient priority
*/
if (compaction_failed(compact_result)) {
- if (*migrate_mode == MIGRATE_ASYNC) {
- *migrate_mode = MIGRATE_SYNC_LIGHT;
+ if (*compact_priority > MIN_COMPACT_PRIORITY) {
+ (*compact_priority)--;
return true;
}
return false;
@@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
@@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
- enum migrate_mode *migrate_mode,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
struct zone *zone;
@@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
retry:
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
+ pg_data_t *last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, ac_classzone_idx(ac));
+ ac->high_zoneidx, ac->nodemask) {
+ if (last_pgdat != zone->zone_pgdat)
+ wakeup_kswapd(zone, order, ac->high_zoneidx);
+ last_pgdat = zone->zone_pgdat;
+ }
}
static inline unsigned int
@@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (gfp_mask & __GFP_MEMALLOC)
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- }
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
- return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
- return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ if (gfp_mask & __GFP_MEMALLOC)
+ return true;
+ if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ return true;
+ if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ return true;
+
+ return false;
}
/*
@@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
/*
- * Keep reclaiming pages while there is a chance this will lead somewhere.
- * If none of the target zones can satisfy our allocation request even
- * if all reclaimable pages are considered then we are screwed and have
- * to go OOM.
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
@@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* prevent from pre mature OOM
*/
if (!did_some_progress) {
- unsigned long writeback;
- unsigned long dirty;
+ unsigned long write_pending;
- writeback = zone_page_state_snapshot(zone,
- NR_WRITEBACK);
- dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+ write_pending = zone_page_state_snapshot(zone,
+ NR_ZONE_WRITE_PENDING);
- if (2*(writeback + dirty) > reclaimable) {
+ if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
@@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
@@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry:
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+ * alloc_flags precisely. So we do that now.
+ */
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
- * OK, we're below the kswapd watermark and have kicked background
- * reclaim. Now things get more complex, so set up alloc_flags according
- * to how we want to proceed.
+ * The adjusted alloc_flags might result in immediate success, so try
+ * that first
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. Don't try
+ * that for allocations that are allowed to ignore watermarks, as the
+ * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ */
+ if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+ !gfp_pfmemalloc_allowed(gfp_mask)) {
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac,
+ INIT_COMPACT_PRIORITY,
+ &compact_result);
+ if (page)
+ goto got_pg;
+
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes THP page fault allocations
+ */
+ if (gfp_mask & __GFP_NORETRY) {
+ /*
+ * If compaction is deferred for high-order allocations,
+ * it is because sync compaction recently failed. If
+ * this is the case and the caller requested a THP
+ * allocation, we do not want to heavily disrupt the
+ * system, so we fail the allocation instead of entering
+ * direct reclaim.
+ */
+ if (compact_result == COMPACT_DEFERRED)
+ goto nopage;
+
+ /*
+ * Looks like reclaim/compaction is worth trying, but
+ * sync compaction could be very expensive, so keep
+ * using async compaction.
+ */
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+ }
+
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ wake_all_kswapds(order, ac);
+
+ if (gfp_pfmemalloc_allowed(gfp_mask))
+ alloc_flags = ALLOC_NO_WATERMARKS;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
- /* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ /* Attempt with potentially adjusted zonelist and alloc_flags */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
- /* Allocate without watermarks if the context allows */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
- if (page)
- goto got_pg;
- }
-
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
@@ -3640,38 +3590,6 @@ retry:
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /*
- * Try direct compaction. The first pass is asynchronous. Subsequent
- * attempts after direct reclaim are synchronous
- */
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
- migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
-
- /* Checks for THP-specific high-order allocations */
- if (is_thp_gfp_mask(gfp_mask)) {
- /*
- * If compaction is deferred for high-order allocations, it is
- * because sync compaction recently failed. If this is the case
- * and the caller requested a THP allocation, we do not want
- * to heavily disrupt the system, so we fail the allocation
- * instead of entering direct reclaim.
- */
- if (compact_result == COMPACT_DEFERRED)
- goto nopage;
-
- /*
- * Compaction is contended so rather back off than cause
- * excessive stalls.
- */
- if(compact_result == COMPACT_CONTENDED)
- goto nopage;
- }
-
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3679,16 +3597,25 @@ retry:
if (page)
goto got_pg;
+ /* Try direct compaction and then allocating */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ compact_priority, &compact_result);
+ if (page)
+ goto got_pg;
+
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
+
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
- goto noretry;
+ goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
- goto noretry;
+ goto nopage;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3712,7 +3639,7 @@ retry:
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
- compact_result, &migration_mode,
+ compact_result, &compact_priority,
compaction_retries))
goto retry;
@@ -3727,25 +3654,6 @@ retry:
goto retry;
}
-noretry:
- /*
- * High-order allocations do not necessarily loop after direct reclaim
- * and reclaim/compaction depends on compaction being called after
- * reclaim so call directly if necessary.
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse. All other requests should tolerate
- * at least light sync migration.
- */
- if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_ASYNC;
- else
- migration_mode = MIGRATE_SYNC_LIGHT;
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
- ac, migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
@@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
struct page *page;
unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
@@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
- val->sharedram = global_page_state(NR_SHMEM);
+ val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
@@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
- val->sharedram = node_page_state(nid, NR_SHMEM);
- val->freeram = node_page_state(nid, NR_FREE_PAGES);
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
@@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter)
unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
+ pg_data_t *pgdat;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
-#endif
" free:%lu free_pcp:%lu free_cma:%lu\n",
- global_page_state(NR_ACTIVE_ANON),
- global_page_state(NR_INACTIVE_ANON),
- global_page_state(NR_ISOLATED_ANON),
- global_page_state(NR_ACTIVE_FILE),
- global_page_state(NR_INACTIVE_FILE),
- global_page_state(NR_ISOLATED_FILE),
- global_page_state(NR_UNEVICTABLE),
- global_page_state(NR_FILE_DIRTY),
- global_page_state(NR_WRITEBACK),
- global_page_state(NR_UNSTABLE_NFS),
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
- global_page_state(NR_FILE_MAPPED),
- global_page_state(NR_SHMEM),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
- global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
- global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
-#endif
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
+ for_each_online_pgdat(pgdat) {
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " unstable:%lukB"
+ " pages_scanned:%lu"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+ * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+ K(node_page_state(pgdat, NR_SHMEM)),
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ node_page_state(pgdat, NR_PAGES_SCANNED),
+ !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ }
+
for_each_populated_zone(zone) {
int i;
@@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter)
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
+ " writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " mapped:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
- " unstable:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
- " writeback_tmp:%lukB"
- " pages_scanned:%lu"
- " all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
- K(zone_page_state(zone, NR_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ISOLATED_ANON)),
- K(zone_page_state(zone, NR_ISOLATED_FILE)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_FILE_DIRTY)),
- K(zone_page_state(zone, NR_WRITEBACK)),
- K(zone_page_state(zone, NR_FILE_MAPPED)),
- K(zone_page_state(zone, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
- K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
- * HPAGE_PMD_NR),
- K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
-#endif
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
- zone_page_state(zone, NR_KERNEL_STACK) *
- THREAD_SIZE / 1024,
+ zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
- K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
- K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
- K(zone_page_state(zone, NR_PAGES_SCANNED)),
- (!zone_reclaimable(zone) ? "yes" : "no")
- );
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %ld", zone->lowmem_reserve[i]);
@@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter)
hugetlb_show_meminfo();
- printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
show_swap_cache_info();
}
@@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu);
+
+ if (!zone->zone_pgdat->per_cpu_nodestats) {
+ zone->zone_pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
+ }
}
/*
@@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kcompactd_wait);
#endif
pgdat_page_ext_init(pgdat);
+ spin_lock_init(&pgdat->lru_lock);
+ lruvec_init(node_lruvec(pgdat));
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
+ zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
- lruvec_init(&zone->lruvec);
if (!size)
continue;
@@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void)
enum zone_type i, j;
for_each_online_pgdat(pgdat) {
+
+ pgdat->totalreserve_pages = 0;
+
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
@@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
- zone->totalreserve_pages = max;
+ pgdat->totalreserve_pages += max;
reserve_pages += max;
}
@@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
@@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 4ea9c4ef5146..ae11aa914e55 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn)
return NULL;
zone = page_zone(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
return page;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index dcc5d3769608..fb1fa269d3a0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -166,6 +166,8 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
unsigned block_in_page;
sector_t first_block;
+ cond_resched();
+
first_block = bmap(inode, probe_block);
if (first_block == 0)
goto bad_bmap;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8a13d9f7b566..709bc83703b1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -27,7 +27,7 @@
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
@@ -1213,8 +1213,8 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_zone_page_state(page, NR_ANON_THPS);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
+ __inc_node_page_state(page, NR_ANON_THPS);
+ __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
}
if (unlikely(PageKsm(page)))
return;
@@ -1251,14 +1251,14 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- __inc_zone_page_state(page, NR_ANON_THPS);
+ __inc_node_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
+ __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1282,7 +1282,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else {
if (PageTransCompound(page)) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1293,7 +1293,7 @@ void page_add_file_rmap(struct page *page, bool compound)
if (!atomic_inc_and_test(&page->_mapcount))
goto out;
}
- __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
+ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
out:
unlock_page_memcg(page);
@@ -1322,18 +1322,18 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
}
/*
- * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * We use the irq-unsafe __{inc|mod}_zone_page_state because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr);
+ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
@@ -1356,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_zone_page_state(page, NR_ANON_THPS);
+ __dec_node_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
@@ -1375,7 +1375,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
clear_page_mlock(page);
if (nr) {
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
deferred_split_huge_page(page);
}
}
@@ -1404,7 +1404,7 @@ void page_remove_rmap(struct page *page, bool compound)
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_zone_page_state(page, NR_ANON_PAGES);
+ __dec_node_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index 62e42c7d544c..2ac19a61d565 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -575,9 +575,9 @@ static int shmem_add_to_page_cache(struct page *page,
if (!error) {
mapping->nrpages += nr;
if (PageTransHuge(page))
- __inc_zone_page_state(page, NR_SHMEM_THPS);
- __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr);
- __mod_zone_page_state(page_zone(page), NR_SHMEM, nr);
+ __inc_node_page_state(page, NR_SHMEM_THPS);
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
@@ -601,8 +601,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
- __dec_zone_page_state(page, NR_FILE_PAGES);
- __dec_zone_page_state(page, NR_SHMEM);
+ __dec_node_page_state(page, NR_FILE_PAGES);
+ __dec_node_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
put_page(page);
BUG_ON(error);
@@ -1493,8 +1493,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
if (!error) {
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ __inc_node_page_state(newpage, NR_FILE_PAGES);
+ __dec_node_page_state(oldpage, NR_FILE_PAGES);
}
spin_unlock_irq(&swap_mapping->tree_lock);
diff --git a/mm/slab.h b/mm/slab.h
index f33980ab0406..9653f2e2591a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -369,6 +369,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
return s->object_size;
# endif
+ if (s->flags & SLAB_KASAN)
+ return s->object_size;
/*
* If we have the need to store the freelist pointer
* back there or track user information then we can
diff --git a/mm/slub.c b/mm/slub.c
index f9da8716b8b3..74e7c8c30db8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,7 +124,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
-static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+inline void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
p += s->red_left_pad;
@@ -454,8 +454,6 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
*/
#if defined(CONFIG_SLUB_DEBUG_ON)
static int slub_debug = DEBUG_DEFAULT_FLAGS;
-#elif defined(CONFIG_KASAN)
-static int slub_debug = SLAB_STORE_USER;
#else
static int slub_debug;
#endif
@@ -660,6 +658,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
+ off += kasan_metadata_size(s);
+
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
print_section("Padding ", p + off, size_from_object(s) - off);
@@ -787,6 +787,8 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
+ off += kasan_metadata_size(s);
+
if (size_from_object(s) == off)
return 1;
@@ -1322,8 +1324,10 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x);
}
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
+static inline void *slab_free_hook(struct kmem_cache *s, void *x)
{
+ void *freeptr;
+
kmemleak_free_recursive(x, s->flags);
/*
@@ -1344,7 +1348,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+ freeptr = get_freepointer(s, x);
+ /*
+ * kasan_slab_free() may put x into memory quarantine, delaying its
+ * reuse. In this case the object's freelist pointer is changed.
+ */
kasan_slab_free(s, x);
+ return freeptr;
}
static inline void slab_free_freelist_hook(struct kmem_cache *s,
@@ -1362,11 +1372,11 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
void *object = head;
void *tail_obj = tail ? : head;
+ void *freeptr;
do {
- slab_free_hook(s, object);
- } while ((object != tail_obj) &&
- (object = get_freepointer(s, object)));
+ freeptr = slab_free_hook(s, object);
+ } while ((object != tail_obj) && (object = freeptr));
#endif
}
@@ -2878,16 +2888,13 @@ slab_empty:
* same page) possible by specifying head and tail ptr, plus objects
* count (cnt). Bulk free indicated by tail pointer being set.
*/
-static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
- void *head, void *tail, int cnt,
- unsigned long addr)
+static __always_inline void do_slab_free(struct kmem_cache *s,
+ struct page *page, void *head, void *tail,
+ int cnt, unsigned long addr)
{
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
-
- slab_free_freelist_hook(s, head, tail);
-
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -2921,6 +2928,27 @@ redo:
}
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
+{
+ slab_free_freelist_hook(s, head, tail);
+ /*
+ * slab_free_freelist_hook() could have put the items into quarantine.
+ * If so, no need to free them.
+ */
+ if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
+ return;
+ do_slab_free(s, page, head, tail, cnt, addr);
+}
+
+#ifdef CONFIG_KASAN
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
+{
+ do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+}
+#endif
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
s = cache_from_obj(s, x);
@@ -3363,7 +3391,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
- unsigned long size = s->object_size;
+ size_t size = s->object_size;
int order;
/*
@@ -3422,7 +3450,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the object.
*/
size += 2 * sizeof(struct track);
+#endif
+ kasan_cache_create(s, &size, &s->flags);
+#ifdef CONFIG_SLUB_DEBUG
if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
diff --git a/mm/sparse.c b/mm/sparse.c
index 5d0cf4540364..36d7bbb80e49 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -100,11 +100,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
}
#endif
-/*
- * Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case because
- * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
- */
+#ifdef CONFIG_SPARSEMEM_EXTREME
int __section_nr(struct mem_section* ms)
{
unsigned long root_nr;
@@ -123,6 +119,12 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
+#else
+int __section_nr(struct mem_section* ms)
+{
+ return (int)(ms - mem_section[0]);
+}
+#endif
/*
* During early boot, before section_mem_map is used for an actual
diff --git a/mm/swap.c b/mm/swap.c
index 616df4ddd870..75c63bb2a1da 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page)
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(&zone->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ spin_lock_irqsave(zone_lru_lock(zone), flags);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
mem_cgroup_uncharge(page);
}
@@ -179,26 +179,26 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
void *arg)
{
int i;
- struct zone *zone = NULL;
+ struct pglist_data *pgdat = NULL;
struct lruvec *lruvec;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
+ struct pglist_data *pagepgdat = page_pgdat(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ if (pagepgdat != pgdat) {
+ if (pgdat)
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ pgdat = pagepgdat;
+ spin_lock_irqsave(&pgdat->lru_lock, flags);
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
(*move_fn)(page, lruvec, arg);
}
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ if (pgdat)
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
@@ -318,9 +318,9 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page);
page = compound_head(page);
- spin_lock_irq(&zone->lru_lock);
- __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
- spin_unlock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
+ __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
+ spin_unlock_irq(zone_lru_lock(zone));
}
#endif
@@ -445,16 +445,16 @@ void lru_cache_add(struct page *page)
*/
void add_page_to_unevictable_list(struct page *page)
{
- struct zone *zone = page_zone(page);
+ struct pglist_data *pgdat = page_pgdat(page);
struct lruvec *lruvec;
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ spin_lock_irq(&pgdat->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
ClearPageActive(page);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
}
/**
@@ -730,7 +730,7 @@ void release_pages(struct page **pages, int nr, bool cold)
{
int i;
LIST_HEAD(pages_to_free);
- struct zone *zone = NULL;
+ struct pglist_data *locked_pgdat = NULL;
struct lruvec *lruvec;
unsigned long uninitialized_var(flags);
unsigned int uninitialized_var(lock_batch);
@@ -741,11 +741,11 @@ void release_pages(struct page **pages, int nr, bool cold)
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same zone. The lock is held only if zone != NULL.
+ * same pgdat. The lock is held only if pgdat != NULL.
*/
- if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
+ if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ locked_pgdat = NULL;
}
if (is_huge_zero_page(page)) {
@@ -758,27 +758,27 @@ void release_pages(struct page **pages, int nr, bool cold)
continue;
if (PageCompound(page)) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
+ if (locked_pgdat) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ locked_pgdat = NULL;
}
__put_compound_page(page);
continue;
}
if (PageLRU(page)) {
- struct zone *pagezone = page_zone(page);
+ struct pglist_data *pgdat = page_pgdat(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock,
+ if (pgdat != locked_pgdat) {
+ if (locked_pgdat)
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags);
lock_batch = 0;
- zone = pagezone;
- spin_lock_irqsave(&zone->lru_lock, flags);
+ locked_pgdat = pgdat;
+ spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -789,8 +789,8 @@ void release_pages(struct page **pages, int nr, bool cold)
list_add(&page->lru, &pages_to_free);
}
- if (zone)
- spin_unlock_irqrestore(&zone->lru_lock, flags);
+ if (locked_pgdat)
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_hot_cold_page_list(&pages_to_free, cold);
@@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
- !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
+ !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
if (!list)
SetPageLRU(page_tail);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c99463ac02fb..c8310a37be3a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -95,7 +95,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
entry.val, page);
if (likely(!error)) {
address_space->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total);
}
spin_unlock_irq(&address_space->tree_lock);
@@ -147,7 +147,7 @@ void __delete_from_swap_cache(struct page *page)
set_page_private(page, 0);
ClearPageSwapCache(page);
address_space->nrpages--;
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ __dec_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
}
diff --git a/mm/util.c b/mm/util.c
index 8d010ef2ce1c..662cddf914af 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -528,7 +528,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
+ free += global_node_page_state(NR_FILE_PAGES);
/*
* shmem pages shouldn't be counted as free in this
@@ -536,7 +536,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* that won't affect the overall amount of available
* memory in the system.
*/
- free -= global_page_state(NR_SHMEM);
+ free -= global_node_page_state(NR_SHMEM);
free += get_nr_swap_pages();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 21d417ccff69..650d26832569 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -84,6 +84,9 @@ struct scan_control {
/* Scan (total_size >> priority) pages at once */
int priority;
+ /* The highest zone to isolate pages for reclaim from */
+ enum zone_type reclaim_idx;
+
unsigned int may_writepage:1;
/* Can mapped pages be reclaimed? */
@@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc)
}
#endif
+/*
+ * This misses isolated pages which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
- zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
- zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+ return nr;
+}
+
+unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
+{
+ unsigned long nr;
+
+ nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
+ node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
+ node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
- zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
- zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+ nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
+ node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
+ node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
return nr;
}
-bool zone_reclaimable(struct zone *zone)
+bool pgdat_reclaimable(struct pglist_data *pgdat)
{
- return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
- zone_reclaimable_pages(zone) * 6;
+ return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
+ pgdat_reclaimable_pages(pgdat) * 6;
}
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
- return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+ return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}
/*
@@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page);
- inc_zone_page_state(page, NR_VMSCAN_WRITE);
+ inc_node_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page,
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct zone *zone,
+ struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
@@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON_PAGE(PageActive(page), page);
- VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
@@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
- test_bit(ZONE_WRITEBACK, &zone->flags)) {
+ test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
nr_immediate++;
goto keep_locked;
@@ -1092,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !test_bit(ZONE_DIRTY, &zone->flags))) {
+ !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
* except we already have the page isolated
* and know it's dirty
*/
- inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto keep_locked;
@@ -1266,11 +1286,11 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
}
- ret = shrink_page_list(&clean_pages, zone, &sc,
+ ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
}
@@ -1348,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
return ret;
}
+
/*
- * zone->lru_lock is heavily contended. Some of the functions that
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a santity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+ enum lru_list lru, unsigned long *nr_zone_taken,
+ unsigned long nr_taken)
+{
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ if (!nr_zone_taken[zid])
+ continue;
+
+ __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+ }
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+#endif
+}
+
+/*
+ * zone_lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
@@ -1375,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
- unsigned long scan;
+ unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+ unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
+ unsigned long scan, nr_pages;
+ LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
- !list_empty(src); scan++) {
+ !list_empty(src);) {
struct page *page;
page = lru_to_page(src);
@@ -1386,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
VM_BUG_ON_PAGE(!PageLRU(page), page);
+ if (page_zonenum(page) > sc->reclaim_idx) {
+ list_move(&page->lru, &pages_skipped);
+ nr_skipped[page_zonenum(page)]++;
+ continue;
+ }
+
+ /*
+ * Account for scanned and skipped separetly to avoid the pgdat
+ * being prematurely marked unreclaimable by pgdat_reclaimable.
+ */
+ scan++;
+
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_taken += hpage_nr_pages(page);
+ nr_pages = hpage_nr_pages(page);
+ nr_taken += nr_pages;
+ nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
break;
@@ -1402,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
}
+ /*
+ * Splice any skipped pages to the start of the LRU list. Note that
+ * this disrupts the LRU order when reclaiming for lower zones but
+ * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+ * scanning would soon rescan the same pages to skip and put the
+ * system at risk of premature OOM.
+ */
+ if (!list_empty(&pages_skipped)) {
+ int zid;
+ unsigned long total_skipped = 0;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ if (!nr_skipped[zid])
+ continue;
+
+ __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+ total_skipped += nr_skipped[zid];
+ }
+
+ /*
+ * Account skipped pages as a partial scan as the pgdat may be
+ * close to unreclaimable. If the LRU list is empty, account
+ * skipped pages as a full scan.
+ */
+ scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+
+ list_splice(&pages_skipped, src);
+ }
*nr_scanned = scan;
- trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
+ update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
return nr_taken;
}
@@ -1444,8 +1533,8 @@ int isolate_lru_page(struct page *page)
struct zone *zone = page_zone(page);
struct lruvec *lruvec;
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ spin_lock_irq(zone_lru_lock(zone));
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
@@ -1453,7 +1542,7 @@ int isolate_lru_page(struct page *page)
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
return ret;
}
@@ -1465,7 +1554,7 @@ int isolate_lru_page(struct page *page)
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
-static int too_many_isolated(struct zone *zone, int file,
+static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
@@ -1477,11 +1566,11 @@ static int too_many_isolated(struct zone *zone, int file,
return 0;
if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
} else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
}
/*
@@ -1499,7 +1588,7 @@ static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
LIST_HEAD(pages_to_free);
/*
@@ -1512,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
continue;
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
@@ -1535,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
}
@@ -1563,8 +1652,32 @@ static int current_may_throttle(void)
bdi_write_congested(current->backing_dev_info);
}
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+ struct scan_control *sc, enum lru_list lru)
+{
+ int zid;
+ struct zone *zone;
+ int file = is_file_lru(lru);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ if (!global_reclaim(sc))
+ return true;
+
+ for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+ zone = &pgdat->node_zones[zid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+ LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+ return true;
+ }
+
+ return false;
+}
+
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
static noinline_for_stack unsigned long
@@ -1582,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_immediate = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- while (unlikely(too_many_isolated(zone, file, sc))) {
+ if (!inactive_reclaimable_pages(lruvec, sc, lru))
+ return 0;
+
+ while (unlikely(too_many_isolated(pgdat, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
@@ -1600,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
- update_lru_size(lruvec, lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
else
- __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+ __count_vm_events(PGSCAN_DIRECT, nr_scanned);
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
&nr_dirty, &nr_unqueued_dirty, &nr_congested,
&nr_writeback, &nr_immediate,
false);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
if (global_reclaim(sc)) {
if (current_is_kswapd())
- __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
- nr_reclaimed);
+ __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
else
- __count_zone_vm_events(PGSTEAL_DIRECT, zone,
- nr_reclaimed);
+ __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&page_list);
free_hot_cold_page_list(&page_list, true);
@@ -1661,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* are encountered in the nr_immediate check below.
*/
if (nr_writeback && nr_writeback == nr_taken)
- set_bit(ZONE_WRITEBACK, &zone->flags);
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
* Legacy memcg will stall in page writeback so avoid forcibly
@@ -1673,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* backed by a congested BDI and wait_iff_congested will stall.
*/
if (nr_dirty && nr_dirty == nr_congested)
- set_bit(ZONE_CONGESTED, &zone->flags);
+ set_bit(PGDAT_CONGESTED, &pgdat->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_DIRTY and kswapd will start writing pages from
+ * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
* reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
- set_bit(ZONE_DIRTY, &zone->flags);
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
* If kswapd scans pages marked marked for immediate
@@ -1701,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
*/
if (!sc->hibernation_mode && !current_is_kswapd() &&
current_may_throttle())
- wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+ wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
- trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ nr_scanned, nr_reclaimed,
sc->priority, file);
return nr_reclaimed;
}
@@ -1715,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
+ * appropriate to hold zone_lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
+ * should drop zone_lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
@@ -1731,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
struct list_head *pages_to_free,
enum lru_list lru)
{
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
unsigned long pgmoved = 0;
struct page *page;
int nr_pages;
while (!list_empty(list)) {
page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += nr_pages;
@@ -1754,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, pages_to_free);
}
@@ -1783,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
@@ -1792,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
- update_lru_size(lruvec, lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc))
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
- __count_zone_vm_events(PGREFILL, zone, nr_scanned);
+ __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
+ __count_vm_events(PGREFILL, nr_scanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
@@ -1850,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
@@ -1861,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
@@ -1894,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+ struct scan_control *sc)
{
unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
unsigned long gb;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int zid;
/*
* If we don't have swap space, anonymous page deactivation
@@ -1911,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
+ /*
+ * For zone-constrained allocations, it is necessary to check if
+ * deactivations are required for lowmem to be reclaimed. This
+ * calculates the inactive/active pages available in eligible zones.
+ */
+ for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+ unsigned long inactive_zone, active_zone;
+
+ if (!populated_zone(zone))
+ continue;
+
+ inactive_zone = zone_page_state(zone,
+ NR_ZONE_LRU_BASE + (file * LRU_FILE));
+ active_zone = zone_page_state(zone,
+ NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+
+ inactive -= min(inactive, inactive_zone);
+ active -= min(active, active_zone);
+ }
+
gb = (inactive + active) >> (30 - PAGE_SHIFT);
if (gb)
inactive_ratio = int_sqrt(10 * gb);
@@ -1924,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -1956,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file;
@@ -1977,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* well.
*/
if (current_is_kswapd()) {
- if (!zone_reclaimable(zone))
+ if (!pgdat_reclaimable(pgdat))
force_scan = true;
if (!mem_cgroup_online(memcg))
force_scan = true;
@@ -2023,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon pages. Try to detect this based on file LRU size.
*/
if (global_reclaim(sc)) {
- unsigned long zonefile;
- unsigned long zonefree;
+ unsigned long pgdatfile;
+ unsigned long pgdatfree;
+ int z;
+ unsigned long total_high_wmark = 0;
- zonefree = zone_page_state(zone, NR_FREE_PAGES);
- zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
+ pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
- if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!populated_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
scan_balance = SCAN_ANON;
goto out;
}
@@ -2045,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true) &&
+ if (!inactive_list_is_low(lruvec, true, sc) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2077,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
@@ -2098,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
@@ -2174,12 +2321,12 @@ static inline void init_tlb_ubc(void)
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ * This is a basic per-node page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *lru_pages)
{
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2287,7 +2434,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false, sc))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2312,13 +2459,14 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* calls try_to_compact_zone() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
-static inline bool should_continue_reclaim(struct zone *zone,
+static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
unsigned long inactive_lru_pages;
+ int z;
/* If not in reclaim/compaction mode, stop */
if (!in_reclaim_compaction(sc))
@@ -2352,25 +2500,32 @@ static inline bool should_continue_reclaim(struct zone *zone,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
- inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
/* If compaction would go ahead or the allocation would succeed, stop */
- switch (compaction_suitable(zone, sc->order, 0, 0)) {
- case COMPACT_PARTIAL:
- case COMPACT_CONTINUE:
- return false;
- default:
- return true;
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!populated_zone(zone))
+ continue;
+
+ switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ /* check next zone */
+ ;
+ }
}
+ return true;
}
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
- bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
@@ -2379,10 +2534,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
- .zone = zone,
+ .pgdat = pgdat,
.priority = sc->priority,
};
- unsigned long zone_lru_pages = 0;
+ unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
nr_reclaimed = sc->nr_reclaimed;
@@ -2403,11 +2558,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_zone_memcg(zone, memcg, sc, &lru_pages);
- zone_lru_pages += lru_pages;
+ shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
+ node_lru_pages += lru_pages;
- if (memcg && is_classzone)
- shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+ if (!global_reclaim(sc))
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->nr_scanned - scanned,
lru_pages);
@@ -2419,7 +2574,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
- * zone.
+ * node.
*
* Limit reclaim, on the other hand, only cares about
* nr_to_reclaim pages to be reclaimed and it will
@@ -2437,10 +2592,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
- if (global_reclaim(sc) && is_classzone)
- shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+ if (global_reclaim(sc))
+ shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
sc->nr_scanned - nr_scanned,
- zone_lru_pages);
+ node_lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2455,7 +2610,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
- } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+ } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
return reclaimable;
@@ -2465,9 +2620,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
* Returns true if compaction should go ahead for a high-order request, or
* the high-order allocation would succeed without compaction.
*/
-static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
- unsigned long balance_gap, watermark;
+ unsigned long watermark;
bool watermark_ok;
/*
@@ -2476,23 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
- watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
+ watermark = high_wmark_pages(zone) + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
/*
* If compaction is deferred, reclaim up to a point where
* compaction will have a chance of success when re-enabled
*/
- if (compaction_deferred(zone, order))
+ if (compaction_deferred(zone, sc->order))
return watermark_ok;
/*
* If compaction is not ready to start and allocation is not likely
* to succeed without it, then keep reclaiming.
*/
- if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
+ if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2503,14 +2656,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
* try to reclaim pages from zones which will satisfy the caller's allocation
* request.
*
- * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
- * Because:
- * a) The caller may be trying to free *extra* pages to satisfy a higher-order
- * allocation or
- * b) The target zone may be at high_wmark_pages(zone) but the lower zones
- * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
- * zone defense algorithm.
- *
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
@@ -2521,7 +2666,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
gfp_t orig_mask;
- enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+ pg_data_t *last_pgdat = NULL;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2529,21 +2674,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
orig_mask = sc->gfp_mask;
- if (buffer_heads_over_limit)
+ if (buffer_heads_over_limit) {
sc->gfp_mask |= __GFP_HIGHMEM;
+ sc->reclaim_idx = gfp_zone(sc->gfp_mask);
+ }
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
- enum zone_type classzone_idx;
-
- if (!populated_zone(zone))
- continue;
-
- classzone_idx = requested_highidx;
- while (!populated_zone(zone->zone_pgdat->node_zones +
- classzone_idx))
- classzone_idx--;
-
+ sc->reclaim_idx, sc->nodemask) {
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
@@ -2554,7 +2691,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
if (sc->priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
+ !pgdat_reclaimable(zone->zone_pgdat))
continue; /* Let kswapd poll it */
/*
@@ -2568,20 +2705,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
*/
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
- zonelist_zone_idx(z) <= requested_highidx &&
- compaction_ready(zone, sc->order, requested_highidx)) {
+ compaction_ready(zone, sc)) {
sc->compaction_ready = true;
continue;
}
/*
+ * Shrink each node in the zonelist once. If the
+ * zonelist is ordered by zone (not the default) then a
+ * node may be shrunk multiple times but in that case
+ * the user prefers lower zones being preserved.
+ */
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+
+ /*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
* scanned pages. This works for global memory pressure
* and balancing, not for a memcg's limit.
*/
nr_soft_scanned = 0;
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
sc->order, sc->gfp_mask,
&nr_soft_scanned);
sc->nr_reclaimed += nr_soft_reclaimed;
@@ -2589,7 +2734,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}
- shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+ /* See comment about same check for global reclaim above */
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ shrink_node(zone->zone_pgdat, sc);
}
/*
@@ -2625,7 +2774,7 @@ retry:
delayacct_freepages_start();
if (global_reclaim(sc))
- count_vm_event(ALLOCSTALL);
+ __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2692,7 +2841,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
if (!populated_zone(zone) ||
- zone_reclaimable_pages(zone) == 0)
+ pgdat_reclaimable_pages(pgdat) == 0)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2707,7 +2856,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- pgdat->classzone_idx = min(pgdat->classzone_idx,
+ pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
(enum zone_type)ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -2815,6 +2964,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,
@@ -2833,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
- gfp_mask);
+ gfp_mask,
+ sc.reclaim_idx);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -2844,9 +2995,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#ifdef CONFIG_MEMCG
-unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
+unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
- struct zone *zone,
+ pg_data_t *pgdat,
unsigned long *nr_scanned)
{
struct scan_control sc = {
@@ -2854,6 +3005,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.target_mem_cgroup = memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
unsigned long lru_pages;
@@ -2863,16 +3015,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
sc.may_writepage,
- sc.gfp_mask);
+ sc.gfp_mask,
+ sc.reclaim_idx);
/*
* NOTE: Although we can get the priority field, using it
* here is not a good idea, since it limits the pages we can scan.
- * if we don't reclaim here, the shrink_zone from balance_pgdat
+ * if we don't reclaim here, the shrink_node from balance_pgdat
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
+ shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2892,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ .reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
@@ -2910,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
- sc.gfp_mask);
+ sc.gfp_mask,
+ sc.reclaim_idx);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -2920,7 +3075,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static void age_active_anon(struct pglist_data *pgdat,
+ struct scan_control *sc)
{
struct mem_cgroup *memcg;
@@ -2929,9 +3085,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false, sc))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2939,82 +3095,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, bool highorder,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone) + balance_gap;
+ unsigned long mark = high_wmark_pages(zone);
+
+ if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ return false;
/*
- * When checking from pgdat_balanced(), kswapd should stop and sleep
- * when it reaches the high order-0 watermark and let kcompactd take
- * over. Other callers such as wakeup_kswapd() want to determine the
- * true high-order watermark.
+ * If any eligible zone is balanced then the node is not considered
+ * to be congested or dirty
*/
- if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
- mark += (1UL << order);
- order = 0;
- }
-
- return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
-}
-
-/*
- * pgdat_balanced() is used when checking if a node is balanced.
- *
- * For order-0, all zones must be balanced!
- *
- * For high-order allocations only zones that meet watermarks and are in a
- * zone allowed by the callers classzone_idx are added to balanced_pages. The
- * total of balanced pages must be at least 25% of the zones allowed by
- * classzone_idx for the node to be considered balanced. Forcing all zones to
- * be balanced for high orders can cause excessive reclaim when there are
- * imbalanced zones.
- * The choice of 25% is due to
- * o a 16M DMA zone that is balanced will not balance a zone on any
- * reasonable sized machine
- * o On all other machines, the top zone must be at least a reasonable
- * percentage of the middle zones. For example, on 32-bit x86, highmem
- * would need to be at least 256M for it to be balance a whole node.
- * Similarly, on x86-64 the Normal zone would need to be at least 1G
- * to balance a node on its own. These seemed like reasonable ratios.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
-{
- unsigned long managed_pages = 0;
- unsigned long balanced_pages = 0;
- int i;
-
- /* Check the watermark levels */
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- managed_pages += zone->managed_pages;
-
- /*
- * A special case here:
- *
- * balance_pgdat() skips over all_unreclaimable after
- * DEF_PRIORITY. Effectively, it considers them balanced so
- * they must be considered balanced here as well!
- */
- if (!zone_reclaimable(zone)) {
- balanced_pages += zone->managed_pages;
- continue;
- }
+ clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
- if (zone_balanced(zone, order, false, 0, i))
- balanced_pages += zone->managed_pages;
- else if (!order)
- return false;
- }
-
- if (order)
- return balanced_pages >= (managed_pages >> 2);
- else
- return true;
+ return true;
}
/*
@@ -3023,12 +3118,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
*
* Returns true if kswapd is ready to sleep
*/
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
- int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
- /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
- if (remaining)
- return false;
+ int i;
/*
* The throttled processes are normally woken up in balance_pgdat() as
@@ -3046,91 +3138,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
- return pgdat_balanced(pgdat, order, classzone_idx);
+ for (i = 0; i <= classzone_idx; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (!zone_balanced(zone, order, classzone_idx))
+ return false;
+ }
+
+ return true;
}
/*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
+ * kswapd shrinks a node of pages that are at or below the highest usable
+ * zone that is currently unbalanced.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
-static bool kswapd_shrink_zone(struct zone *zone,
- int classzone_idx,
+static bool kswapd_shrink_node(pg_data_t *pgdat,
struct scan_control *sc)
{
- unsigned long balance_gap;
- bool lowmem_pressure;
+ struct zone *zone;
+ int z;
- /* Reclaim above the high watermark. */
- sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+ /* Reclaim a number of pages proportional to the number of zones */
+ sc->nr_to_reclaim = 0;
+ for (z = 0; z <= sc->reclaim_idx; z++) {
+ zone = pgdat->node_zones + z;
+ if (!populated_zone(zone))
+ continue;
- /*
- * We put equal pressure on every zone, unless one zone has way too
- * many pages free already. The "too many pages" is defined as the
- * high wmark plus a "gap" where the gap is either the low
- * watermark or 1% of the zone, whichever is smaller.
- */
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+ sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+ }
/*
- * If there is no low memory pressure or the zone is balanced then no
- * reclaim is necessary
+ * Historically care was taken to put equal pressure on all zones but
+ * now pressure is applied based on node LRU order.
*/
- lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
- balance_gap, classzone_idx))
- return true;
-
- shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
- clear_bit(ZONE_WRITEBACK, &zone->flags);
+ shrink_node(pgdat, sc);
/*
- * If a zone reaches its high watermark, consider it to be no longer
- * congested. It's possible there are dirty pages backed by congested
- * BDIs but as pressure is relieved, speculatively avoid congestion
- * waits.
+ * Fragmentation may mean that the system cannot be rebalanced for
+ * high-order allocations. If twice the allocation size has been
+ * reclaimed then recheck watermarks only at order-0 to prevent
+ * excessive reclaim. Assume that a process requested a high-order
+ * can direct reclaim/compact.
*/
- if (zone_reclaimable(zone) &&
- zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
- clear_bit(ZONE_CONGESTED, &zone->flags);
- clear_bit(ZONE_DIRTY, &zone->flags);
- }
+ if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+ sc->order = 0;
return sc->nr_scanned >= sc->nr_to_reclaim;
}
/*
- * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at high_wmark_pages(zone).
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
*
- * Returns the highest zone idx kswapd was reclaiming at
- *
- * There is special handling here for zones which are full of pinned pages.
- * This can happen if the pages are all mlocked, or if they are all used by
- * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
- * What we do is to detect the case where all pages in the zone have been
- * scanned twice and there has been zero successful reclaim. Mark the zone as
- * dead and from now on, only perform a short scan. Basically we're polling
- * the zone for when the problem goes away.
+ * Returns the order kswapd finished reclaiming at.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
- * lower zones regardless of the number of free pages in the lower zones. This
- * interoperates with the page allocator fallback scheme to ensure that aging
- * of pages is balanced across the zones.
+ * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
*/
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
@@ -3145,100 +3227,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
bool raise_priority = true;
sc.nr_reclaimed = 0;
+ sc.reclaim_idx = classzone_idx;
/*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
+ * If the number of buffer_heads exceeds the maximum allowed
+ * then consider reclaiming from all zones. This has a dual
+ * purpose -- on 64-bit systems it is expected that
+ * buffer_heads are stripped during active rotation. On 32-bit
+ * systems, highmem pages can pin lowmem memory and shrinking
+ * buffers can relieve lowmem pressure. Reclaim may still not
+ * go ahead if all eligible zones for the original allocation
+ * request are balanced to avoid excessive reclaim from kswapd.
*/
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
- continue;
-
- /*
- * Do some background aging of the anon list, to give
- * pages a chance to be referenced before reclaiming.
- */
- age_active_anon(zone, &sc);
+ if (buffer_heads_over_limit) {
+ for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!populated_zone(zone))
+ continue;
- /*
- * If the number of buffer_heads in the machine
- * exceeds the maximum allowed level and this node
- * has a highmem zone, force kswapd to reclaim from
- * it to relieve lowmem pressure.
- */
- if (buffer_heads_over_limit && is_highmem_idx(i)) {
- end_zone = i;
+ sc.reclaim_idx = i;
break;
}
+ }
- if (!zone_balanced(zone, order, false, 0, 0)) {
- end_zone = i;
- break;
- } else {
- /*
- * If balanced, clear the dirty and congested
- * flags
- */
- clear_bit(ZONE_CONGESTED, &zone->flags);
- clear_bit(ZONE_DIRTY, &zone->flags);
- }
+ /*
+ * Only reclaim if there are no eligible zones. Check from
+ * high to low zone as allocations prefer higher zones.
+ * Scanning from low to high zone would allow congestion to be
+ * cleared during a very small window when a small low
+ * zone was balanced even under extreme pressure when the
+ * overall node may be congested. Note that sc.reclaim_idx
+ * is not used as buffer_heads_over_limit may have adjusted
+ * it.
+ */
+ for (i = classzone_idx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_balanced(zone, sc.order, classzone_idx))
+ goto out;
}
- if (i < 0)
- goto out;
+ /*
+ * Do some background aging of the anon list, to give
+ * pages a chance to be referenced before reclaiming. All
+ * pages are rotated regardless of classzone as this is
+ * about consistent aging.
+ */
+ age_active_anon(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
- if (sc.priority < DEF_PRIORITY - 2)
+ if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
sc.may_writepage = 1;
+ /* Call soft limit reclaim before calling shrink_node. */
+ sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
+ sc.gfp_mask, &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+
/*
- * Now scan the zone in the dma->highmem direction, stopping
- * at the last zone which needs scanning.
- *
- * We do this because the page allocator works in the opposite
- * direction. This prevents the page allocator from allocating
- * pages behind kswapd's direction of progress, which would
- * cause too much scanning of the lower zones.
+ * There should be no need to raise the scanning priority if
+ * enough pages are already being scanned that that high
+ * watermark would be met at 100% efficiency.
*/
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
- continue;
-
- sc.nr_scanned = 0;
-
- nr_soft_scanned = 0;
- /*
- * Call soft limit reclaim before calling shrink_zone.
- */
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- order, sc.gfp_mask,
- &nr_soft_scanned);
- sc.nr_reclaimed += nr_soft_reclaimed;
-
- /*
- * There should be no need to raise the scanning
- * priority if enough pages are already being scanned
- * that that high watermark would be met at 100%
- * efficiency.
- */
- if (kswapd_shrink_zone(zone, end_zone, &sc))
- raise_priority = false;
- }
+ if (kswapd_shrink_node(pgdat, &sc))
+ raise_priority = false;
/*
* If the low watermark is met there is no need for processes
@@ -3259,19 +3318,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
- } while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, classzone_idx));
+ } while (sc.priority >= 1);
out:
/*
- * Return the highest zone idx we were reclaiming at so
- * prepare_kswapd_sleep() makes the same decisions as here.
+ * Return the order kswapd stopped reclaiming at as
+ * prepare_kswapd_sleep() takes it into account. If another caller
+ * entered the allocator slow path while kswapd was awake, order will
+ * remain at the higher level.
*/
- return end_zone;
+ return sc.order;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
- int classzone_idx, int balanced_classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
+ unsigned int classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3282,8 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining,
- balanced_classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@@ -3296,9 +3355,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
- wakeup_kcompactd(pgdat, order, classzone_idx);
+ wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
remaining = schedule_timeout(HZ/10);
+
+ /*
+ * If woken prematurely then reset kswapd_classzone_idx and
+ * order. The values will either be from a wakeup request or
+ * the previous request that slept prematurely.
+ */
+ if (remaining) {
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
+ }
+
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
}
@@ -3307,8 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining,
- balanced_classzone_idx)) {
+ if (!remaining &&
+ prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3349,9 +3419,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
*/
static int kswapd(void *p)
{
- unsigned long order, new_order;
- int classzone_idx, new_classzone_idx;
- int balanced_classzone_idx;
+ unsigned int alloc_order, reclaim_order, classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -3381,38 +3449,20 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- order = new_order = 0;
- classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
- balanced_classzone_idx = classzone_idx;
+ pgdat->kswapd_order = alloc_order = reclaim_order = 0;
+ pgdat->kswapd_classzone_idx = classzone_idx = 0;
for ( ; ; ) {
bool ret;
- /*
- * While we were reclaiming, there might have been another
- * wakeup, so check the values.
- */
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
+kswapd_try_sleep:
+ kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
+ classzone_idx);
- if (order < new_order || classzone_idx > new_classzone_idx) {
- /*
- * Don't sleep if someone wants a larger 'order'
- * allocation or has tigher zone constraints
- */
- order = new_order;
- classzone_idx = new_classzone_idx;
- } else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx,
- balanced_classzone_idx);
- order = pgdat->kswapd_max_order;
- classzone_idx = pgdat->classzone_idx;
- new_order = order;
- new_classzone_idx = classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ /* Read the new order and classzone_idx */
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = pgdat->kswapd_classzone_idx;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = 0;
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3422,11 +3472,25 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret) {
- trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = balance_pgdat(pgdat, order,
- classzone_idx);
- }
+ if (ret)
+ continue;
+
+ /*
+ * Reclaim begins at the requested order but if a high-order
+ * reclaim fails then kswapd falls back to reclaiming for
+ * order-0. If that happens, kswapd will consider sleeping
+ * for the order it finished reclaiming at (reclaim_order)
+ * but kcompactd is woken to compact for the original
+ * request (alloc_order).
+ */
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+ alloc_order);
+ reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ if (reclaim_order < alloc_order)
+ goto kswapd_try_sleep;
+
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = pgdat->kswapd_classzone_idx;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3442,6 +3506,7 @@ static int kswapd(void *p)
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
+ int z;
if (!populated_zone(zone))
return;
@@ -3449,14 +3514,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- if (pgdat->kswapd_max_order < order) {
- pgdat->kswapd_max_order = order;
- pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
- }
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, true, 0, 0))
- return;
+
+ /* Only wake kswapd if all zones are unbalanced */
+ for (z = 0; z <= classzone_idx; z++) {
+ zone = pgdat->node_zones + z;
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_balanced(zone, order, classzone_idx))
+ return;
+ }
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
@@ -3477,6 +3548,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
+ .reclaim_idx = MAX_NR_ZONES - 1,
.priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
@@ -3578,12 +3650,12 @@ module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
- * Zone reclaim mode
+ * Node reclaim mode
*
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3591,14 +3663,14 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
@@ -3609,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1;
*/
int sysctl_min_slab_ratio = 5;
-static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
- unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
- unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ACTIVE_FILE);
+ unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
+ unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_FILE);
/*
* It's possible for there to be more file mapped pages than
@@ -3624,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
@@ -3632,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
/*
* If RECLAIM_UNMAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
- * pages like swapcache and zone_unmapped_file_pages() provides
+ * pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_UNMAP)
- nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
- nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
- if (!(zone_reclaim_mode & RECLAIM_WRITE))
- delta += zone_page_state(zone, NR_FILE_DIRTY);
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3652,22 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
}
/*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
*/
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
+ int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
- .priority = ZONE_RECLAIM_PRIORITY,
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
+ .reclaim_idx = classzone_idx,
};
cond_resched();
@@ -3681,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
- shrink_zone(zone, &sc, true);
+ shrink_node(pgdat, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
@@ -3697,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
return sc.nr_reclaimed >= nr_pages;
}
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
- int node_id;
int ret;
/*
- * Zone reclaim reclaims unmapped file backed pages and
+ * Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
- * thrown out if the zone is overallocated. So we do not reclaim
- * if less than a specified percentage of the zone is used by
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
- if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
- return ZONE_RECLAIM_FULL;
+ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
- if (!zone_reclaimable(zone))
- return ZONE_RECLAIM_FULL;
+ if (!pgdat_reclaimable(pgdat))
+ return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
- return ZONE_RECLAIM_NOSCAN;
+ return NODE_RECLAIM_NOSCAN;
/*
- * Only run zone reclaim on the local zone or on zones that do not
+ * Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone_to_nid(zone);
- if (node_state(node_id, N_CPU) && node_id != numa_node_id())
- return ZONE_RECLAIM_NOSCAN;
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
- return ZONE_RECLAIM_NOSCAN;
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
- ret = __zone_reclaim(zone, gfp_mask, order);
- clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3778,24 +3850,23 @@ int page_evictable(struct page *page)
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
struct lruvec *lruvec;
- struct zone *zone = NULL;
+ struct pglist_data *pgdat = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pages[i];
- struct zone *pagezone;
+ struct pglist_data *pagepgdat = page_pgdat(page);
pgscanned++;
- pagezone = page_zone(page);
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ if (pagepgdat != pgdat) {
+ if (pgdat)
+ spin_unlock_irq(&pgdat->lru_lock);
+ pgdat = pagepgdat;
+ spin_lock_irq(&pgdat->lru_lock);
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
if (!PageLRU(page) || !PageUnevictable(page))
continue;
@@ -3811,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
}
}
- if (zone) {
+ if (pgdat) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
}
}
#endif /* CONFIG_SHMEM */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7997f52935c9..89cec42d19ff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
*
* vm_stat contains the global counters
*/
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
-EXPORT_SYMBOL(vm_stat);
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_SMP
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
*/
void refresh_zone_stat_thresholds(void)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int cpu;
int threshold;
+ /* Zero current pgdat thresholds */
+ for_each_online_pgdat(pgdat) {
+ for_each_online_cpu(cpu) {
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
+ }
+ }
+
for_each_populated_zone(zone) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long max_drift, tolerate_drift;
threshold = calculate_normal_threshold(zone);
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ int pgdat_threshold;
+
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
+ /* Base nodestat threshold on the largest populated zone. */
+ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
+ = max(threshold, pgdat_threshold);
+ }
+
/*
* Only set percpu_drift_mark if there is a danger that
* NR_FREE_PAGES reports the low watermark is ok when in fact
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
EXPORT_SYMBOL(__mod_zone_page_state);
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ long x;
+ long t;
+
+ x = delta + __this_cpu_read(*p);
+
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(x > t || x < -t)) {
+ node_page_state_add(x, pgdat, item);
+ x = 0;
+ }
+ __this_cpu_write(*p, x);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
/*
* Optimized increment and decrement functions.
*
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
}
}
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s8 v, t;
+
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
+
+ node_page_state_add(v + overstep, pgdat, item);
+ __this_cpu_write(*p, -overstep);
+ }
+}
+
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ __inc_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
}
}
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s8 v, t;
+
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
+
+ node_page_state_add(v - overstep, pgdat, item);
+ __this_cpu_write(*p, overstep);
+ }
+}
+
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ __dec_node_state(page_pgdat(page), item);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
* If we have cmpxchg_local support then we do not need to incur the overhead
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
* 1 Overstepping half of threshold
* -1 Overstepping minus half of threshold
*/
-static inline void mod_state(struct zone *zone, enum zone_stat_item item,
- long delta, int overstep_mode)
+static inline void mod_zone_state(struct zone *zone,
+ enum zone_stat_item item, long delta, int overstep_mode)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -359,26 +442,83 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
long delta)
{
- mod_state(zone, item, delta, 0);
+ mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
- mod_state(zone, item, 1, 1);
-}
-
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
- mod_state(page_zone(page), item, 1, 1);
+ mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
- mod_state(page_zone(page), item, -1, -1);
+ mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+ enum node_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to node counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong the cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a node.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1) ;
+
+ /* Overflow must be added to node counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
#else
/*
* Use interrupt disable to serialize counter updates
@@ -394,15 +534,6 @@ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
EXPORT_SYMBOL(mod_zone_page_state);
-void inc_zone_state(struct zone *zone, enum zone_stat_item item)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __inc_zone_state(zone, item);
- local_irq_restore(flags);
-}
-
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
unsigned long flags;
@@ -424,21 +555,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
-#endif
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __inc_node_state(pgdat, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_state);
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+ long delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_node_page_state(pgdat, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+ unsigned long flags;
+ struct pglist_data *pgdat;
+
+ pgdat = page_pgdat(page);
+ local_irq_save(flags);
+ __inc_node_state(pgdat, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __dec_node_page_state(page, item);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+#endif
/*
* Fold a differential into the global counters.
* Returns the number of counters updated.
*/
-static int fold_diff(int *diff)
+static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
int changes = 0;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (diff[i]) {
- atomic_long_add(diff[i], &vm_stat[i]);
+ if (zone_diff[i]) {
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+ changes++;
+ }
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (node_diff[i]) {
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
changes++;
}
return changes;
@@ -462,9 +641,11 @@ static int fold_diff(int *diff)
*/
static int refresh_cpu_vm_stats(bool do_pagesets)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int i;
- int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
int changes = 0;
for_each_populated_zone(zone) {
@@ -477,7 +658,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
if (v) {
atomic_long_add(v, &zone->vm_stat[i]);
- global_diff[i] += v;
+ global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
/* 3 seconds idle till flush */
__this_cpu_write(p->expire, 3);
@@ -516,7 +697,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
#endif
}
- changes += fold_diff(global_diff);
+
+ for_each_online_pgdat(pgdat) {
+ struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ int v;
+
+ v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
+ if (v) {
+ atomic_long_add(v, &pgdat->vm_stat[i]);
+ global_node_diff[i] += v;
+ }
+ }
+ }
+
+ changes += fold_diff(global_zone_diff, global_node_diff);
return changes;
}
@@ -527,9 +723,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
*/
void cpu_vm_stats_fold(int cpu)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int i;
- int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
for_each_populated_zone(zone) {
struct per_cpu_pageset *p;
@@ -543,11 +741,27 @@ void cpu_vm_stats_fold(int cpu)
v = p->vm_stat_diff[i];
p->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]);
- global_diff[i] += v;
+ global_zone_diff[i] += v;
}
}
- fold_diff(global_diff);
+ for_each_online_pgdat(pgdat) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ if (p->vm_node_stat_diff[i]) {
+ int v;
+
+ v = p->vm_node_stat_diff[i];
+ p->vm_node_stat_diff[i] = 0;
+ atomic_long_add(v, &pgdat->vm_stat[i]);
+ global_node_diff[i] += v;
+ }
+ }
+
+ fold_diff(global_zone_diff, global_node_diff);
}
/*
@@ -563,16 +777,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
int v = pset->vm_stat_diff[i];
pset->vm_stat_diff[i] = 0;
atomic_long_add(v, &zone->vm_stat[i]);
- atomic_long_add(v, &vm_stat[i]);
+ atomic_long_add(v, &vm_zone_stat[i]);
}
}
#endif
#ifdef CONFIG_NUMA
/*
- * Determine the per node value of a stat item.
+ * Determine the per node value of a stat item. This function
+ * is called frequently in a NUMA machine, so try to be as
+ * frugal as possible.
*/
-unsigned long node_page_state(int node, enum zone_stat_item item)
+unsigned long sum_zone_node_page_state(int node,
+ enum zone_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
int i;
@@ -584,6 +801,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
return count;
}
+/*
+ * Determine the per node value of a stat item.
+ */
+unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ long x = atomic_long_read(&pgdat->vm_stat[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
#endif
#ifdef CONFIG_COMPACTION
@@ -691,33 +921,18 @@ int fragmentation_index(struct zone *zone, unsigned int order)
const char * const vmstat_text[] = {
/* enum zone_stat_item countes */
"nr_free_pages",
- "nr_alloc_batch",
- "nr_inactive_anon",
- "nr_active_anon",
- "nr_inactive_file",
- "nr_active_file",
- "nr_unevictable",
+ "nr_zone_inactive_anon",
+ "nr_zone_active_anon",
+ "nr_zone_inactive_file",
+ "nr_zone_active_file",
+ "nr_zone_unevictable",
+ "nr_zone_write_pending",
"nr_mlock",
- "nr_anon_pages",
- "nr_mapped",
- "nr_file_pages",
- "nr_dirty",
- "nr_writeback",
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
- "nr_unstable",
"nr_bounce",
- "nr_vmscan_write",
- "nr_vmscan_immediate_reclaim",
- "nr_writeback_temp",
- "nr_isolated_anon",
- "nr_isolated_file",
- "nr_shmem",
- "nr_dirtied",
- "nr_written",
- "nr_pages_scanned",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
#endif
@@ -729,13 +944,35 @@ const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_free_cma",
+
+ /* Node-based counters */
+ "nr_inactive_anon",
+ "nr_active_anon",
+ "nr_inactive_file",
+ "nr_active_file",
+ "nr_unevictable",
+ "nr_isolated_anon",
+ "nr_isolated_file",
+ "nr_pages_scanned",
"workingset_refault",
"workingset_activate",
"workingset_nodereclaim",
- "nr_anon_transparent_hugepages",
+ "nr_anon_pages",
+ "nr_mapped",
+ "nr_file_pages",
+ "nr_dirty",
+ "nr_writeback",
+ "nr_writeback_temp",
+ "nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
- "nr_free_cma",
+ "nr_anon_transparent_hugepages",
+ "nr_unstable",
+ "nr_vmscan_write",
+ "nr_vmscan_immediate_reclaim",
+ "nr_dirtied",
+ "nr_written",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -749,6 +986,8 @@ const char * const vmstat_text[] = {
"pswpout",
TEXTS_FOR_ZONES("pgalloc")
+ TEXTS_FOR_ZONES("allocstall")
+ TEXTS_FOR_ZONES("pgskip")
"pgfree",
"pgactivate",
@@ -758,11 +997,11 @@ const char * const vmstat_text[] = {
"pgmajfault",
"pglazyfreed",
- TEXTS_FOR_ZONES("pgrefill")
- TEXTS_FOR_ZONES("pgsteal_kswapd")
- TEXTS_FOR_ZONES("pgsteal_direct")
- TEXTS_FOR_ZONES("pgscan_kswapd")
- TEXTS_FOR_ZONES("pgscan_direct")
+ "pgrefill",
+ "pgsteal_kswapd",
+ "pgsteal_direct",
+ "pgscan_kswapd",
+ "pgscan_direct",
"pgscan_direct_throttle",
#ifdef CONFIG_NUMA
@@ -774,7 +1013,6 @@ const char * const vmstat_text[] = {
"kswapd_low_wmark_hit_quickly",
"kswapd_high_wmark_hit_quickly",
"pageoutrun",
- "allocstall",
"pgrotated",
@@ -1180,17 +1418,41 @@ static const struct file_operations pagetypeinfo_file_ops = {
.release = seq_release,
};
+static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
+{
+ int zid;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *compare = &pgdat->node_zones[zid];
+
+ if (populated_zone(compare))
+ return zone == compare;
+ }
+
+ /* The zone must be somewhere! */
+ WARN_ON_ONCE(1);
+ return false;
+}
+
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
int i;
seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ if (is_zone_first_populated(pgdat, zone)) {
+ seq_printf(m, "\n per-node stats");
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ seq_printf(m, "\n %-12s %lu",
+ vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+ node_page_state(pgdat, i));
+ }
+ }
seq_printf(m,
"\n pages free %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
- "\n scanned %lu"
+ "\n node_scanned %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu",
@@ -1198,13 +1460,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- zone_page_state(zone, NR_PAGES_SCANNED),
+ node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
seq_printf(m,
@@ -1234,12 +1496,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
#endif
}
seq_printf(m,
- "\n all_unreclaimable: %u"
- "\n start_pfn: %lu"
- "\n inactive_ratio: %u",
- !zone_reclaimable(zone),
+ "\n node_unreclaimable: %u"
+ "\n start_pfn: %lu"
+ "\n node_inactive_ratio: %u",
+ !pgdat_reclaimable(zone->zone_pgdat),
zone->zone_start_pfn,
- zone->inactive_ratio);
+ zone->zone_pgdat->inactive_ratio);
seq_putc(m, '\n');
}
@@ -1287,6 +1549,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -1301,6 +1564,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v[i] = global_page_state(i);
v += NR_VM_ZONE_STAT_ITEMS;
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ v[i] = global_node_page_state(i);
+ v += NR_VM_NODE_STAT_ITEMS;
+
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
v + NR_DIRTY_THRESHOLD);
v += NR_VM_WRITEBACK_STAT_ITEMS;
@@ -1325,7 +1592,6 @@ static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
-
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
return 0;
}
@@ -1390,13 +1656,12 @@ int vmstat_refresh(struct ctl_table *table, int write,
if (err)
return err;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
- val = atomic_long_read(&vm_stat[i]);
+ val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
switch (i) {
- case NR_ALLOC_BATCH:
case NR_PAGES_SCANNED:
/*
- * These are often seen to go negative in
+ * This is often seen to go negative in
* recent kernels, but not to go permanently
* negative. Whilst it would be nicer not to
* have exceptions, rooting them out would be
diff --git a/mm/workingset.c b/mm/workingset.c
index 577277546d98..69551cfae97b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,7 +16,7 @@
/*
* Double CLOCK lists
*
- * Per zone, two clock lists are maintained for file pages: the
+ * Per node, two clock lists are maintained for file pages: the
* inactive and the active list. Freshly faulted pages start out at
* the head of the inactive list and page reclaim scans pages from the
* tail. Pages that are accessed multiple times on the inactive list
@@ -141,11 +141,11 @@
*
* Implementation
*
- * For each zone's file LRU lists, a counter for inactive evictions
- * and activations is maintained (zone->inactive_age).
+ * For each node's file LRU lists, a counter for inactive evictions
+ * and activations is maintained (node->inactive_age).
*
* On eviction, a snapshot of this counter (along with some bits to
- * identify the zone) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache radix tree
* slot of the evicted page. This is called a shadow entry.
*
* On cache misses for which there are shadow entries, an eligible
@@ -153,7 +153,7 @@
*/
#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
- ZONES_SHIFT + NODES_SHIFT + \
+ NODES_SHIFT + \
MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
@@ -167,33 +167,30 @@
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
{
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
- eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
- eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
+ eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- int memcgid, nid, zid;
+ int memcgid, nid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
- zid = entry & ((1UL << ZONES_SHIFT) - 1);
- entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
entry >>= MEM_CGROUP_ID_SHIFT;
*memcgidp = memcgid;
- *zonep = NODE_DATA(nid)->node_zones + zid;
+ *pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
}
@@ -208,7 +205,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
struct mem_cgroup *memcg = page_memcg(page);
- struct zone *zone = page_zone(page);
+ struct pglist_data *pgdat = page_pgdat(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
@@ -218,9 +215,9 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, zone, eviction);
+ return pack_shadow(memcgid, pgdat, eviction);
}
/**
@@ -228,7 +225,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the zone it was allocated in.
+ * evicted page in the context of the node it was allocated in.
*
* Returns %true if the page should be activated, %false otherwise.
*/
@@ -240,10 +237,10 @@ bool workingset_refault(void *shadow)
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct zone *zone;
+ struct pglist_data *pgdat;
int memcgid;
- unpack_shadow(shadow, &memcgid, &zone, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
rcu_read_lock();
/*
@@ -267,7 +264,7 @@ bool workingset_refault(void *shadow)
rcu_read_unlock();
return false;
}
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
rcu_read_unlock();
@@ -290,10 +287,10 @@ bool workingset_refault(void *shadow)
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
- inc_zone_state(zone, WORKINGSET_REFAULT);
+ inc_node_state(pgdat, WORKINGSET_REFAULT);
if (refault_distance <= active_file) {
- inc_zone_state(zone, WORKINGSET_ACTIVATE);
+ inc_node_state(pgdat, WORKINGSET_ACTIVATE);
return true;
}
return false;
@@ -305,9 +302,10 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
+ struct mem_cgroup *memcg;
struct lruvec *lruvec;
- lock_page_memcg(page);
+ rcu_read_lock();
/*
* Filter non-memcg pages here, e.g. unmap can call
* mark_page_accessed() on VDSO pages.
@@ -315,12 +313,13 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
- if (!mem_cgroup_disabled() && !page_memcg(page))
+ memcg = page_memcg_rcu(page);
+ if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
atomic_long_inc(&lruvec->inactive_age);
out:
- unlock_page_memcg(page);
+ rcu_read_unlock();
}
/*
@@ -349,12 +348,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- if (memcg_kmem_enabled())
+ if (memcg_kmem_enabled()) {
pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
LRU_ALL_FILE);
- else
- pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
- node_page_state(sc->nid, NR_INACTIVE_FILE);
+ } else {
+ pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
+ node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
+ }
/*
* Active cache pages are limited to 50% of memory, and shadow
@@ -433,7 +433,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
}
BUG_ON(node->count);
- inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
if (!__radix_tree_delete_node(&mapping->page_tree, node))
BUG();
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 04176de6df70..b0bc023d25c5 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -20,6 +20,7 @@
* page->freelist(index): links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field
* to store handle.
+ * page->units: first object offset in a subpage of zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
@@ -137,9 +138,6 @@
*/
#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
-/*
- * We do not maintain any list for completely empty or full pages
- */
enum fullness_group {
ZS_EMPTY,
ZS_ALMOST_EMPTY,
@@ -467,11 +465,6 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
- return pages_per_zspage * PAGE_SIZE / size;
-}
-
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -635,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
freeable = zs_can_compact(class);
spin_unlock(&class->lock);
- objs_per_zspage = get_maxobj_per_zspage(class->size,
- class->pages_per_zspage);
+ objs_per_zspage = class->objs_per_zspage;
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
@@ -945,8 +937,8 @@ static void unpin_tag(unsigned long handle)
static void reset_page(struct page *page)
{
__ClearPageMovable(page);
- clear_bit(PG_private, &page->flags);
- clear_bit(PG_private_2, &page->flags);
+ ClearPagePrivate(page);
+ ClearPagePrivate2(page);
set_page_private(page, 0);
page_mapcount_reset(page);
ClearPageHugeObject(page);
@@ -1014,8 +1006,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage);
- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
+ zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
}
@@ -1350,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void)
cpu_notifier_register_done();
}
-static void init_zs_size_classes(void)
+static void __init init_zs_size_classes(void)
{
int nr;
@@ -1361,16 +1352,14 @@ static void init_zs_size_classes(void)
zs_size_classes = nr;
}
-static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+static bool can_merge(struct size_class *prev, int pages_per_zspage,
+ int objs_per_zspage)
{
- if (prev->pages_per_zspage != pages_per_zspage)
- return false;
+ if (prev->pages_per_zspage == pages_per_zspage &&
+ prev->objs_per_zspage == objs_per_zspage)
+ return true;
- if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
- != get_maxobj_per_zspage(size, pages_per_zspage))
- return false;
-
- return true;
+ return false;
}
static bool zspage_full(struct size_class *class, struct zspage *zspage)
@@ -1541,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class,
* zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from
* @size: size of block to allocate
+ * @gfp: gfp flags when allocating object
*
* On success, handle to the allocated object is returned,
* otherwise 0.
@@ -1592,8 +1582,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
record_obj(handle, obj);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
- zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
+ zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
@@ -1741,10 +1730,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
- struct page *page, int index)
+ struct page *page, int *obj_idx)
{
unsigned long head;
int offset = 0;
+ int index = *obj_idx;
unsigned long handle = 0;
void *addr = kmap_atomic(page);
@@ -1765,6 +1755,9 @@ static unsigned long find_alloced_obj(struct size_class *class,
}
kunmap_atomic(addr);
+
+ *obj_idx = index;
+
return handle;
}
@@ -1776,7 +1769,7 @@ struct zs_compact_control {
struct page *d_page;
/* Starting object index within @s_page which used for live object
* in the subpage. */
- int index;
+ int obj_idx;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1786,16 +1779,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
unsigned long handle;
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
- unsigned long index = cc->index;
+ int obj_idx = cc->obj_idx;
int ret = 0;
while (1) {
- handle = find_alloced_obj(class, s_page, index);
+ handle = find_alloced_obj(class, s_page, &obj_idx);
if (!handle) {
s_page = get_next_page(s_page);
if (!s_page)
break;
- index = 0;
+ obj_idx = 0;
continue;
}
@@ -1809,7 +1802,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(class, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj);
- index++;
+ obj_idx++;
/*
* record_obj updates handle's value to free_obj and it will
* invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
@@ -1824,7 +1817,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
/* Remember last position in this iteration */
cc->s_page = s_page;
- cc->index = index;
+ cc->obj_idx = obj_idx;
return ret;
}
@@ -2181,8 +2174,7 @@ static int zs_register_migration(struct zs_pool *pool)
static void zs_unregister_migration(struct zs_pool *pool)
{
flush_work(&pool->free_work);
- if (pool->inode)
- iput(pool->inode);
+ iput(pool->inode);
}
/*
@@ -2261,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class)
return 0;
obj_wasted = obj_allocated - obj_used;
- obj_wasted /= get_maxobj_per_zspage(class->size,
- class->pages_per_zspage);
+ obj_wasted /= class->objs_per_zspage;
return obj_wasted * class->pages_per_zspage;
}
@@ -2279,7 +2270,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
if (!zs_can_compact(class))
break;
- cc.index = 0;
+ cc.obj_idx = 0;
cc.s_page = get_first_page(src_zspage);
while ((dst_zspage = isolate_zspage(class, false))) {
@@ -2398,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
/**
* zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * @name: pool name to be created
*
* This function must be called before anything when using
* the zsmalloc allocator.
@@ -2438,6 +2429,7 @@ struct zs_pool *zs_create_pool(const char *name)
for (i = zs_size_classes - 1; i >= 0; i--) {
int size;
int pages_per_zspage;
+ int objs_per_zspage;
struct size_class *class;
int fullness = 0;
@@ -2445,6 +2437,7 @@ struct zs_pool *zs_create_pool(const char *name)
if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE;
pages_per_zspage = get_pages_per_zspage(size);
+ objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
/*
* size_class is used for normal zsmalloc operation such
@@ -2456,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name)
* previous size_class if possible.
*/
if (prev_class) {
- if (can_merge(prev_class, size, pages_per_zspage)) {
+ if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
pool->size_class[i] = prev_class;
continue;
}
@@ -2469,8 +2462,7 @@ struct zs_pool *zs_create_pool(const char *name)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
- class->objs_per_zspage = class->pages_per_zspage *
- PAGE_SIZE / class->size;
+ class->objs_per_zspage = objs_per_zspage;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;