diff options
author | J. Bruce Fields <bfields@redhat.com> | 2014-11-19 12:06:30 -0500 |
---|---|---|
committer | J. Bruce Fields <bfields@redhat.com> | 2014-11-19 12:06:30 -0500 |
commit | 56429e9b3be567a173bd05f5594faf8522c34d3a (patch) | |
tree | d218d430ed992cdfa42da084bf36e5aa3c2ecb26 /mm | |
parent | 5b095e99928cc13332d364f7cca7a9ca684369b4 (diff) | |
parent | 093a1468b6edb0e568be7311b8d2228d205702db (diff) | |
download | linux-56429e9b3be567a173bd05f5594faf8522c34d3a.tar.gz linux-56429e9b3be567a173bd05f5594faf8522c34d3a.tar.bz2 linux-56429e9b3be567a173bd05f5594faf8522c34d3a.zip |
merge nfs bugfixes into nfsd for-3.19 branch
In addition to nfsd bugfixes, there are some fixes in -rc5 for client
bugs that can interfere with my testing.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/balloon_compaction.c | 2 | ||||
-rw-r--r-- | mm/bootmem.c | 9 | ||||
-rw-r--r-- | mm/cma.c | 68 | ||||
-rw-r--r-- | mm/compaction.c | 21 | ||||
-rw-r--r-- | mm/huge_memory.c | 15 | ||||
-rw-r--r-- | mm/internal.h | 25 | ||||
-rw-r--r-- | mm/iov_iter.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 105 | ||||
-rw-r--r-- | mm/memory.c | 1 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 31 | ||||
-rw-r--r-- | mm/mmap.c | 8 | ||||
-rw-r--r-- | mm/nobootmem.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 17 | ||||
-rw-r--r-- | mm/page-writeback.c | 43 | ||||
-rw-r--r-- | mm/page_alloc.c | 76 | ||||
-rw-r--r-- | mm/page_cgroup.c | 1 | ||||
-rw-r--r-- | mm/page_isolation.c | 43 | ||||
-rw-r--r-- | mm/rmap.c | 88 | ||||
-rw-r--r-- | mm/shmem.c | 36 | ||||
-rw-r--r-- | mm/slab_common.c | 14 | ||||
-rw-r--r-- | mm/truncate.c | 61 |
21 files changed, 458 insertions, 218 deletions
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b3cbe19f71b5..fcad8322ef36 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -68,11 +68,13 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * to be released by the balloon driver. */ if (trylock_page(page)) { +#ifdef CONFIG_BALLOON_COMPACTION if (!PagePrivate(page)) { /* raced with isolation */ unlock_page(page); continue; } +#endif spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); __count_vm_event(BALLOON_DEFLATE); diff --git a/mm/bootmem.c b/mm/bootmem.c index 8a000cebb0d7..477be696511d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } @@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma) err: kfree(cma->bitmap); + cma->count = 0; return -EINVAL; } @@ -217,9 +218,8 @@ int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t highmem_start = __pa(high_memory); int ret = 0; - pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", - __func__, (unsigned long)size, (unsigned long)base, - (unsigned long)limit, (unsigned long)alignment); + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); @@ -244,52 +244,72 @@ int __init cma_declare_contiguous(phys_addr_t base, size = ALIGN(size, alignment); limit &= ~(alignment - 1); + if (!base) + fixed = false; + /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; /* - * adjust limit to avoid crossing low/high memory boundary for - * automatically allocated regions + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. */ - if (((limit == 0 || limit > memblock_end) && - (memblock_end - size < highmem_start && - memblock_end > highmem_start)) || - (!fixed && limit > highmem_start && limit - size < highmem_start)) { - limit = highmem_start; - } - - if (fixed && base < highmem_start && base+size > highmem_start) { + if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; - pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", - (unsigned long)base, (unsigned long)highmem_start); + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); goto err; } + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + /* Reserve memory */ - if (base && fixed) { + if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range(size, alignment, + highmem_start, limit); + limit = highmem_start; + } + if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; + addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } } + + base = addr; } ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); if (ret) goto err; - pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); return 0; err: diff --git a/mm/compaction.c b/mm/compaction.c index edba18aed173..f9792ba3537c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -479,6 +479,16 @@ isolate_freepages_range(struct compact_control *cc, block_end_pfn = min(block_end_pfn, end_pfn); + /* + * pfn could pass the block_end_pfn if isolated freepage + * is more than pageblock order. In this case, we adjust + * scanning range to right one. + */ + if (pfn >= block_end_pfn) { + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = min(block_end_pfn, end_pfn); + } + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) break; @@ -784,6 +794,9 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, cc->nr_migratepages = 0; break; } + + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; } acct_isolated(cc->zone, cc); @@ -1026,8 +1039,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } acct_isolated(zone, cc); - /* Record where migration scanner will be restarted */ - cc->migrate_pfn = low_pfn; + /* + * Record where migration scanner will be restarted. If we end up in + * the same pageblock as the free scanner, make the scanners fully + * meet so that compact_finished() terminates compaction. + */ + cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 74c78aa8bc2f..de984159cf0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -200,7 +200,7 @@ retry: preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); goto retry; } @@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 829304090b90..a4f90ba7068e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,31 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1 << order); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/iov_iter.c b/mm/iov_iter.c index eafcf60f6b83..e34a3cb6aad6 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -911,9 +911,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) - return min(i->count, i->iov->iov_len - i->iov_offset); - else return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); } EXPORT_SYMBOL(iov_iter_single_seg_count); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23976fd885fd..d6ac0e33e150 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -2204,41 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * Notes: Race condition + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Charging occurs during page instantiation, while the page is - * unmapped and locked in page migration, or while the page table is - * locked in THP migration. No race is possible. + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Uncharge happens to pages with zero references, no race possible. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Charge moving between groups is protected by checking mm->moving - * account and taking the move_lock in the slowpath. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. */ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2246,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* diff --git a/mm/memory.c b/mm/memory.c index 1cc6bfbd872e..3e503831e042 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1147,6 +1147,7 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { force_flush = 1; + addr += PAGE_SIZE; break; } continue; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 29d8693d0c61..1bf4807cb21e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <asm/tlbflush.h> @@ -1066,6 +1067,16 @@ out: } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +static void reset_node_present_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->present_pages = 0; + + pgdat->node_present_pages = 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { @@ -1096,6 +1107,21 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); + /* + * zone->managed_pages is set to an approximate value in + * free_area_init_core(), which will cause + * /sys/device/system/node/nodeX/meminfo has wrong data. + * So reset it to 0 before any memory is onlined. + */ + reset_node_managed_pages(pgdat); + + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages because they will be updated in + * online_pages() and offline_pages(). + */ + reset_node_present_pages(pgdat); + return pgdat; } @@ -1912,7 +1938,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - struct page *pgdat_page = virt_to_page(pgdat); int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { @@ -1941,10 +1966,6 @@ void try_offline_node(int nid) node_set_offline(nid); unregister_one_node(nid); - if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) - /* node data is allocated from boot memory */ - return; - /* free waittable in each zone */ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; diff --git a/mm/mmap.c b/mm/mmap.c index 7f855206e7fb..87e82b38453c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7c7ab32ee503..90b50468333e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff24c9d83112..19ceae87522d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - -/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * @@ -2344,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2369,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2410,9 +2395,11 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + if (!ret) { + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..616a2c956b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -467,29 +467,6 @@ static inline void rmv_page_order(struct page *page) } /* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). - * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - -/* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && @@ -569,6 +546,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -577,13 +555,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. + */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -594,9 +583,11 @@ static inline void __free_one_page(struct page *page, */ if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + set_page_private(buddy, 0); + if (!is_migrate_isolate(migratetype)) { + __mod_zone_freepage_state(zone, 1 << order, + migratetype); + } } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -715,14 +706,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -739,9 +728,11 @@ static void free_one_page(struct zone *zone, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -1484,7 +1475,7 @@ void split_page(struct page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; @@ -2252,6 +2243,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, } /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. @@ -6400,13 +6399,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3708264d2833..5331c2bd85a2 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr) sizeof(struct page_cgroup) * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2e9481..c8778f7e208e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -75,16 +76,54 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; + struct page *isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - nr_pages = move_freepages_block(zone, page, migratetype); - __mod_zone_freepage_state(zone, nr_pages, migratetype); + + /* + * Because freepage with more than pageblock_order on isolated + * pageblock is restricted to merge due to freepage counting problem, + * it is possible that there is free buddy page. + * move_freepages_block() doesn't care of merge so we need other + * approach in order to merge them. Isolation and free will make + * these pages to be merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (!is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolate freepage with more than pageblock_order, there + * should be no freepage in the range, so we could avoid costly + * pageblock scanning for freepage moving. + */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); } static inline struct page * diff --git a/mm/rmap.c b/mm/rmap.c index 116a5053415b..19886fb2f13a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1042,15 +1042,46 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); +} + +static void page_remove_file_rmap(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned long flags; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) + goto out; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. + */ + __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); +out: + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1061,46 +1092,33 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - bool anon = PageAnon(page); - bool locked; - unsigned long flags; - - /* - * The anon case has no mem_cgroup page_stat to update; but may - * uncharge_page() below, where the lock ordering can deadlock if - * we hold the lock against page_stat move: so avoid it on anon. - */ - if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + if (!PageAnon(page)) { + page_remove_file_rmap(page); + return; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; /* - * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED - * and not charged by memcg for now. - * * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and - * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (unlikely(PageHuge(page))) - goto out; - if (anon) { - if (PageTransHuge(page)) - __dec_zone_page_state(page, - NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); - } else { - __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); - } + if (PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); + if (unlikely(PageMlocked(page))) clear_page_mlock(page); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1110,10 +1128,6 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; -out: - if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); } /* diff --git a/mm/shmem.c b/mm/shmem.c index cd6fc7590e54..185836ba53ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2345,6 +2345,32 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru return 0; } +static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) +{ + struct dentry *whiteout; + int error; + + whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); + if (!whiteout) + return -ENOMEM; + + error = shmem_mknod(old_dir, whiteout, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); + dput(whiteout); + if (error) + return error; + + /* + * Cheat and hash the whiteout while the old dentry is still in + * place, instead of playing games with FS_RENAME_DOES_D_MOVE. + * + * d_lookup() will consistently find one of them at this point, + * not sure which one, but that isn't even important. + */ + d_rehash(whiteout); + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if @@ -2356,7 +2382,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) @@ -2365,6 +2391,14 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (!simple_empty(new_dentry)) return -ENOTEMPTY; + if (flags & RENAME_WHITEOUT) { + int error; + + error = shmem_whiteout(old_dir, old_dentry); + if (error) + return error; + } + if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 3a6e0cfdf03a..dcdab81bd240 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -93,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size) s->object_size); continue; } - -#if !defined(CONFIG_SLUB) - if (!strcmp(s->name, name)) { - pr_err("%s (%s): Cache name already exists.\n", - __func__, name); - dump_stack(); - s = NULL; - return -EINVAL; - } -#endif } WARN_ON(strchr(name, ' ')); /* It confuses parsers */ @@ -269,6 +259,10 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, if (s->size - size >= sizeof(void *)) continue; + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + return s; } return NULL; diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..f1e4d6052369 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ #include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -714,17 +715,73 @@ EXPORT_SYMBOL(truncate_pagecache); * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and before all filesystem specific - * block truncation has been performed. + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + +/** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole |