From f022d8cb7ec70fe8edd56383d876001317ee76b1 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 24 Oct 2014 13:18:39 +0300 Subject: mm: cma: Don't crash on allocation if CMA area can't be activated If activation of the CMA area fails its mutex won't be initialized, leading to an oops at allocation time when trying to lock the mutex. Fix this by setting the cma area count field to 0 when activation fails, leading to allocation returning NULL immediately. Cc: # v3.17 Signed-off-by: Laurent Pinchart Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- mm/cma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 963bc4add9af..5aa1a6f74dec 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -124,6 +124,7 @@ static int __init cma_activate_area(struct cma *cma) err: kfree(cma->bitmap); + cma->count = 0; return -EINVAL; } -- cgit v1.2.3 From 800a85d3d286604b8c539ca7ee90b992316fd2a7 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 24 Oct 2014 13:18:40 +0300 Subject: mm: cma: Always consider a 0 base address reservation as dynamic The fixed parameter to cma_declare_contiguous() tells the function whether the given base address must be honoured or should be considered as a hint only. The API considers a zero base address as meaning any base address, which must never be considered as a fixed value. Part of the implementation correctly checks both fixed and base != 0, but two locations check the fixed value only. Set fixed to false when base is 0 to fix that and simplify the code. Signed-off-by: Laurent Pinchart Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- mm/cma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 5aa1a6f74dec..62a5dccc3fb8 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -245,6 +245,9 @@ int __init cma_declare_contiguous(phys_addr_t base, size = ALIGN(size, alignment); limit &= ~(alignment - 1); + if (!base) + fixed = false; + /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; @@ -268,7 +271,7 @@ int __init cma_declare_contiguous(phys_addr_t base, } /* Reserve memory */ - if (base && fixed) { + if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; -- cgit v1.2.3 From 16195ddd4ebcc10c30b2f232f8e400df8d464380 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 24 Oct 2014 13:18:41 +0300 Subject: mm: cma: Ensure that reservations never cross the low/high mem boundary Commit 95b0e655f914 ("ARM: mm: don't limit default CMA region only to low memory") extended CMA memory reservation to allow usage of high memory. It relied on commit f7426b983a6a ("mm: cma: adjust address limit to avoid hitting low/high memory boundary") to ensure that the reserved block never crossed the low/high memory boundary. While the implementation correctly lowered the limit, it failed to consider the case where the base..limit range crossed the low/high memory boundary with enough space on each side to reserve the requested size on either low or high memory. Rework the base and limit adjustment to fix the problem. The function now starts by rejecting the reservation altogether for fixed reservations that cross the boundary, tries to reserve from high memory first and then falls back to low memory. Signed-off-by: Laurent Pinchart Signed-off-by: Marek Szyprowski --- mm/cma.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 62a5dccc3fb8..c30a6edee65c 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -253,23 +253,24 @@ int __init cma_declare_contiguous(phys_addr_t base, return -EINVAL; /* - * adjust limit to avoid crossing low/high memory boundary for - * automatically allocated regions + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. */ - if (((limit == 0 || limit > memblock_end) && - (memblock_end - size < highmem_start && - memblock_end > highmem_start)) || - (!fixed && limit > highmem_start && limit - size < highmem_start)) { - limit = highmem_start; - } - - if (fixed && base < highmem_start && base+size > highmem_start) { + if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", (unsigned long)base, (unsigned long)highmem_start); goto err; } + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || @@ -278,14 +279,30 @@ int __init cma_declare_contiguous(phys_addr_t base, goto err; } } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range(size, alignment, + highmem_start, limit); + limit = highmem_start; + } + if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; + addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } } + + base = addr; } ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); -- cgit v1.2.3 From 56fa4f609badbe442093409c3d3a051811e54f72 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 24 Oct 2014 13:18:42 +0300 Subject: mm: cma: Use %pa to print physical addresses Casting physical addresses to unsigned long and using %lu truncates the values on systems where physical addresses are larger than 32 bits. Use %pa and get rid of the cast instead. Signed-off-by: Laurent Pinchart Acked-by: Michal Nazarewicz Acked-by: Geert Uytterhoeven Signed-off-by: Marek Szyprowski --- mm/cma.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index c30a6edee65c..fde706e1284f 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -218,9 +218,8 @@ int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t highmem_start = __pa(high_memory); int ret = 0; - pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", - __func__, (unsigned long)size, (unsigned long)base, - (unsigned long)limit, (unsigned long)alignment); + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); @@ -258,8 +257,8 @@ int __init cma_declare_contiguous(phys_addr_t base, */ if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; - pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", - (unsigned long)base, (unsigned long)highmem_start); + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); goto err; } @@ -309,8 +308,8 @@ int __init cma_declare_contiguous(phys_addr_t base, if (ret) goto err; - pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); return 0; err: -- cgit v1.2.3 From ce9ec37bddb633404a0c23e1acb181a264e7f7f2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 28 Oct 2014 13:16:28 -0700 Subject: zap_pte_range: update addr when forcing flush after TLB batching faiure When unmapping a range of pages in zap_pte_range, the page being unmapped is added to an mmu_gather_batch structure for asynchronous freeing. If we run out of space in the batch structure before the range has been completely unmapped, then we break out of the loop, force a TLB flush and free the pages that we have batched so far. If there are further pages to unmap, then we resume the loop where we left off. Unfortunately, we forget to update addr when we break out of the loop, which causes us to truncate the range being invalidated as the end address is exclusive. When we re-enter the loop at the same address, the page has already been freed and the pte_present test will fail, meaning that we do not reconsider the address for invalidation. This patch fixes the problem by incrementing addr by the PAGE_SIZE before breaking out of the loop on batch failure. Signed-off-by: Will Deacon Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 1cc6bfbd872e..3e503831e042 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1147,6 +1147,7 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { force_flush = 1; + addr += PAGE_SIZE; break; } continue; -- cgit v1.2.3 From 401507d67d5c2854f5a88b3f93f64fc6f267bca5 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 29 Oct 2014 14:50:18 -0700 Subject: cgroup/kmemleak: add kmemleak_free() for cgroup deallocations. Commit ff7ee93f4715 ("cgroup/kmemleak: Annotate alloc_page() for cgroup allocations") introduces kmemleak_alloc() for alloc_page_cgroup(), but corresponding kmemleak_free() is missing, which makes kmemleak be wrongly disabled after memory offlining. Log is pasted at the end of this commit message. This patch add kmemleak_free() into free_page_cgroup(). During page offlining, this patch removes corresponding entries in kmemleak rbtree. After that, the freed memory can be allocated again by other subsystems without killing kmemleak. bash # for x in 1 2 3 4; do echo offline > /sys/devices/system/memory/memory$x/state ; sleep 1; done ; dmesg | grep leak Offlined Pages 32768 kmemleak: Cannot insert 0xffff880016969000 into the object search tree (overlaps existing) CPU: 0 PID: 412 Comm: sleep Not tainted 3.17.0-rc5+ #86 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x46/0x58 create_object+0x266/0x2c0 kmemleak_alloc+0x26/0x50 kmem_cache_alloc+0xd3/0x160 __sigqueue_alloc+0x49/0xd0 __send_signal+0xcb/0x410 send_signal+0x45/0x90 __group_send_sig_info+0x13/0x20 do_notify_parent+0x1bb/0x260 do_exit+0x767/0xa40 do_group_exit+0x44/0xa0 SyS_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b kmemleak: Kernel memory leak detector disabled kmemleak: Object 0xffff880016900000 (size 524288): kmemleak: comm "swapper/0", pid 0, jiffies 4294667296 kmemleak: min_count = 0 kmemleak: count = 0 kmemleak: flags = 0x1 kmemleak: checksum = 0 kmemleak: backtrace: log_early+0x63/0x77 kmemleak_alloc+0x4b/0x50 init_section_page_cgroup+0x7f/0xf5 page_cgroup_init+0xc5/0xd0 start_kernel+0x333/0x408 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0xf5/0xfc Fixes: ff7ee93f4715 (cgroup/kmemleak: Annotate alloc_page() for cgroup allocations) Signed-off-by: Wang Nan Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Steven Rostedt Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3708264d2833..5331c2bd85a2 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -171,6 +171,7 @@ static void free_page_cgroup(void *addr) sizeof(struct page_cgroup) * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } -- cgit v1.2.3 From 6ea41c0c0aa37d87ef5dd0d14535d2e1e195cd83 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 29 Oct 2014 14:50:20 -0700 Subject: mm/compaction.c: avoid premature range skip in isolate_migratepages_range Commit edc2ca612496 ("mm, compaction: move pageblock checks up from isolate_migratepages_range()") commonizes isolate_migratepages variants and make them use isolate_migratepages_block(). isolate_migratepages_block() could stop the execution when enough pages are isolated, but, there is no code in isolate_migratepages_range() to handle this case. In the result, even if isolate_migratepages_block() returns prematurely without checking all pages in the range, isolate_migratepages_block() is called repeately on the following pageblock and some pages in the previous range are skipped to check. Then, CMA is failed frequently due to this fact. To fix this problem, this patch let isolate_migratepages_range() know the situation that enough pages are isolated and stop the isolation in that case. Note that isolate_migratepages() has no such problem, because, it always stops the isolation after just one call of isolate_migratepages_block(). Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Minchan Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: Mel Gorman Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index edba18aed173..ec74cf0123ef 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -784,6 +784,9 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, cc->nr_migratepages = 0; break; } + + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; } acct_isolated(cc->zone, cc); -- cgit v1.2.3 From 5ddacbe92b806cd5b4f8f154e8e46ac267fff55c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 29 Oct 2014 14:50:26 -0700 Subject: mm: free compound page with correct order Compound page should be freed by put_page() or free_pages() with correct order. Not doing so will cause tail pages leaked. The compound order can be obtained by compound_order() or use HPAGE_PMD_ORDER in our case. Some people would argue the latter is faster but I prefer the former which is more general. This bug was observed not just on our servers (the worst case we saw is 11G leaked on a 48G machine) but also on our workstations running Ubuntu based distro. $ cat /proc/vmstat | grep thp_zero_page_alloc thp_zero_page_alloc 55 thp_zero_page_alloc_failed 0 This means there is (thp_zero_page_alloc - 1) * (2M - 4K) memory leaked. Fixes: 97ae17497e99 ("thp: implement refcounting for huge zero page") Signed-off-by: Yu Zhao Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: David Rientjes Cc: Bob Liu Cc: [3.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 74c78aa8bc2f..780d12c000e9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -200,7 +200,7 @@ retry: preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); goto retry; } @@ -232,7 +232,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); - __free_page(zero_page); + __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } -- cgit v1.2.3 From 6d50e60cd2edb5a57154db5a6f64eef5aa59b751 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Oct 2014 14:50:31 -0700 Subject: mm, thp: fix collapsing of hugepages on madvise If an anonymous mapping is not allowed to fault thp memory and then madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never collapse this memory into thp memory. This occurs because the madvise(2) handler for thp, hugepage_madvise(), clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags until the final action of madvise_behavior(). This causes the khugepaged_enter_vma_merge() to be a no-op in hugepage_madvise() when the vma had previously had VM_NOHUGEPAGE set. Fix this by passing the correct vma flags to the khugepaged mm slot handler. There's no chance khugepaged can run on this vma until after madvise_behavior() returns since we hold mm->mmap_sem. It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags in hugepage_advise(), but I didn't want to introduce special case behavior into madvise_behavior(). I think it's best to just let it always set vma->vm_flags itself. Signed-off-by: David Rientjes Reported-by: Suleiman Souhlal Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++++++----- mm/mmap.c | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 780d12c000e9..de984159cf0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 7f855206e7fb..87e82b38453c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } -- cgit v1.2.3 From 35dca71c1fad13616d9ea336c05730071793b63a Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 29 Oct 2014 14:50:40 -0700 Subject: memory-hotplug: clear pgdat which is allocated by bootmem in try_offline_node() When hot adding the same memory after hot removal, the following messages are shown: WARNING: CPU: 20 PID: 6 at mm/page_alloc.c:4968 free_area_init_node+0x3fe/0x426() ... Call Trace: dump_stack+0x46/0x58 warn_slowpath_common+0x81/0xa0 warn_slowpath_null+0x1a/0x20 free_area_init_node+0x3fe/0x426 hotadd_new_pgdat+0x90/0x110 add_memory+0xd4/0x200 acpi_memory_device_add+0x1aa/0x289 acpi_bus_attach+0xfd/0x204 acpi_bus_attach+0x178/0x204 acpi_bus_scan+0x6a/0x90 acpi_device_hotplug+0xe8/0x418 acpi_hotplug_work_fn+0x1f/0x2b process_one_work+0x14e/0x3f0 worker_thread+0x11b/0x510 kthread+0xe1/0x100 ret_from_fork+0x7c/0xb0 The detaled explanation is as follows: When hot removing memory, pgdat is set to 0 in try_offline_node(). But if the pgdat is allocated by bootmem allocator, the clearing step is skipped. And when hot adding the same memory, the uninitialized pgdat is reused. But free_area_init_node() checks wether pgdat is set to zero. As a result, free_area_init_node() hits WARN_ON(). This patch clears pgdat which is allocated by bootmem allocator in try_offline_node(). Signed-off-by: Yasuaki Ishimatsu Cc: Zhang Zhen Cc: Wang Nan Cc: Tang Chen Reviewed-by: Toshi Kani Cc: Dave Hansen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 29d8693d0c61..252e1dbbed86 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1912,7 +1912,6 @@ void try_offline_node(int nid) unsigned long start_pfn = pgdat->node_start_pfn; unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; unsigned long pfn; - struct page *pgdat_page = virt_to_page(pgdat); int i; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { @@ -1941,10 +1940,6 @@ void try_offline_node(int nid) node_set_offline(nid); unregister_one_node(nid); - if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) - /* node data is allocated from boot memory */ - return; - /* free waittable in each zone */ for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; -- cgit v1.2.3 From 3a3c02ecf7f2852f122d6d16fb9b3d9cb0c6f201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:46 -0700 Subject: mm: page-writeback: inline account_page_dirtied() into single caller A follow-up patch would have changed the call signature. To save the trouble, just fold it instead. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff24c9d83112..ff6a5b07211e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2115,23 +2115,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); -/* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); + if (!ret) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; -- cgit v1.2.3 From d7365e783edb858279be1d03f61bc8d5d3383d90 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:48 -0700 Subject: mm: memcontrol: fix missed end-writeback page accounting Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page migration to uncharge the old page right away. The page is locked, unmapped, truncated, and off the LRU, but it could race with writeback ending, which then doesn't unaccount the page properly: test_clear_page_writeback() migration wait_on_page_writeback() TestClearPageWriteback() mem_cgroup_migrate() clear PCG_USED mem_cgroup_update_page_stat() if (PageCgroupUsed(pc)) decrease memcg pages under writeback release pc->mem_cgroup->move_lock The per-page statistics interface is heavily optimized to avoid a function call and a lookup_page_cgroup() in the file unmap fast path, which means it doesn't verify whether a page is still charged before clearing PageWriteback() and it has to do it in the stat update later. Rework it so that it looks up the page's memcg once at the beginning of the transaction and then uses it throughout. The charge will be verified before clearing PageWriteback() and migration can't uncharge the page as long as that is still set. The RCU lock will protect the memcg past uncharge. As far as losing the optimization goes, the following test results are from a microbenchmark that maps, faults, and unmaps a 4GB sparse file three times in a nested fashion, so that there are two negative passes that don't account but still go through the new transaction overhead. There is no actual difference: old: 33.195102545 seconds time elapsed ( +- 0.01% ) new: 33.199231369 seconds time elapsed ( +- 0.03% ) The time spent in page_remove_rmap()'s callees still adds up to the same, but the time spent in the function itself seems reduced: # Children Self Command Shared Object Symbol old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 105 ++++++++++++++++++++++++++++------------------------ mm/page-writeback.c | 22 ++++++----- mm/rmap.c | 20 +++++----- 3 files changed, 79 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23976fd885fd..d6ac0e33e150 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -2204,41 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * Notes: Race condition + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Charging occurs during page instantiation, while the page is - * unmapped and locked in page migration, or while the page table is - * locked in THP migration. No race is possible. + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Uncharge happens to pages with zero references, no race possible. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Charge moving between groups is protected by checking mm->moving - * account and taking the move_lock in the slowpath. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. */ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2246,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff6a5b07211e..19ceae87522d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 116a5053415b..f574046f77d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct mem_cgroup *uninitialized_var(memcg); bool anon = PageAnon(page); - bool locked; unsigned long flags; + bool locked; /* * The anon case has no mem_cgroup page_stat to update; but may @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page) * we hold the lock against page_stat move: so avoid it on anon. */ if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page) -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; out: if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /* -- cgit v1.2.3 From 8186eb6a799e4e32f984b55858d8e393938be0c1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:51 -0700 Subject: mm: rmap: split out page_remove_file_rmap() page_remove_rmap() has too many branches on PageAnon() and is hard to follow. Move the file part into a separate function. Signed-off-by: Johannes Weiner Reviewed-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 78 +++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f574046f77d4..19886fb2f13a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1054,6 +1054,36 @@ void page_add_file_rmap(struct page *page) mem_cgroup_end_page_stat(memcg, locked, flags); } +static void page_remove_file_rmap(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned long flags; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) + goto out; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. + */ + __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); +out: + mem_cgroup_end_page_stat(memcg, locked, flags); +} + /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -1062,46 +1092,33 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - struct mem_cgroup *uninitialized_var(memcg); - bool anon = PageAnon(page); - unsigned long flags; - bool locked; - - /* - * The anon case has no mem_cgroup page_stat to update; but may - * uncharge_page() below, where the lock ordering can deadlock if - * we hold the lock against page_stat move: so avoid it on anon. - */ - if (!anon) - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + if (!PageAnon(page)) { + page_remove_file_rmap(page); + return; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; /* - * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED - * and not charged by memcg for now. - * * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and - * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (unlikely(PageHuge(page))) - goto out; - if (anon) { - if (PageTransHuge(page)) - __dec_zone_page_state(page, - NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); - } else { - __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); - } + if (PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); + if (unlikely(PageMlocked(page))) clear_page_mlock(page); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1111,9 +1128,6 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ -out: - if (!anon) - mem_cgroup_end_page_stat(memcg, locked, flags); } /* -- cgit v1.2.3 From 8aba7e0a2c02355f9a7dec629635cb7093fe0508 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 29 Oct 2014 14:50:55 -0700 Subject: mm/slab_common: don't check for duplicate cache names The SLUB cache merges caches with the same size and alignment and there was long standing bug with this behavior: - create the cache named "foo" - create the cache named "bar" (which is merged with "foo") - delete the cache named "foo" (but it stays allocated because "bar" uses it) - create the cache named "foo" again - it fails because the name "foo" is already used That bug was fixed in commit 694617474e33 ("slab_common: fix the check for duplicate slab names") by not warning on duplicate cache names when the SLUB subsystem is used. Recently, cache merging was implemented the with SLAB subsystem too, in 12220dea07f1 ("mm/slab: support slab merge")). Therefore we need stop checking for duplicate names even for the SLAB subsystem. This patch fixes the bug by removing the check. Signed-off-by: Mikulas Patocka Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 3a6e0cfdf03a..406944207b61 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -93,16 +93,6 @@ static int kmem_cache_sanity_check(const char *name, size_t size) s->object_size); continue; } - -#if !defined(CONFIG_SLUB) - if (!strcmp(s->name, name)) { - pr_err("%s (%s): Cache name already exists.\n", - __func__, name); - dump_stack(); - s = NULL; - return -EINVAL; - } -#endif } WARN_ON(strchr(name, ' ')); /* It confuses parsers */ -- cgit v1.2.3 From 4d88e6f7d5ffc84e6094a47925870f4a130555c2 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 29 Oct 2014 14:51:02 -0700 Subject: mm/balloon_compaction: fix deflation when compaction is disabled If CONFIG_BALLOON_COMPACTION=n balloon_page_insert() does not link pages with balloon and doesn't set PagePrivate flag, as a result balloon_page_dequeue() cannot get any pages because it thinks that all of them are isolated. Without balloon compaction nobody can isolate ballooned pages. It's safe to remove this check. Fixes: d6d86c0a7f8d ("mm/balloon_compaction: redesign ballooned pages management"). Signed-off-by: Konstantin Khlebnikov Reported-by: Matt Mullins Cc: [3.17] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/balloon_compaction.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b3cbe19f71b5..fcad8322ef36 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -68,11 +68,13 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) * to be released by the balloon driver. */ if (trylock_page(page)) { +#ifdef CONFIG_BALLOON_COMPACTION if (!PagePrivate(page)) { /* raced with isolation */ unlock_page(page); continue; } +#endif spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); __count_vm_event(BALLOON_DEFLATE); -- cgit v1.2.3