From 8698a745d800c59cd5a576398bdeccd578ac66f1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Mar 2014 18:09:12 +0800 Subject: sched, treewide: Replace hardcoded nice values with MIN_NICE/MAX_NICE Replace various -20/+19 hardcoded nice values with MIN_NICE/MAX_NICE. Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ff13819fd09b7a5dba5ab5ae797f2e7019bdfa17.1394532288.git.yangds.fnst@cn.fujitsu.com Cc: devel@driverdev.osuosl.org Cc: devicetree@vger.kernel.org Cc: fcoe-devel@open-fcoe.org Cc: linux390@de.ibm.com Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: nbd-general@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: openipmi-developer@lists.sourceforge.net Cc: qla2xxx-upstream@qlogic.com Cc: linux-arch@vger.kernel.org [ Consolidated the patches, twiddled the changelog. ] Signed-off-by: Ingo Molnar --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1546655a2d78..dcdb6f9adea1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2803,7 +2803,7 @@ static int khugepaged(void *none) struct mm_slot *mm_slot; set_freezable(); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { khugepaged_do_scan(); -- cgit v1.2.3 From 4e857c58efeb99393cba5a5d0d8ec7117183137c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Mar 2014 18:06:10 +0100 Subject: arch: Mass conversion of smp_mb__*() Mostly scripted conversion of the smp_mb__* barriers. Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/n/tip-55dhyhocezdw1dg7u19hmh1u@git.kernel.org Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Signed-off-by: Ingo Molnar --- mm/backing-dev.c | 2 +- mm/filemap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 09d9591b7708..1706cbbdf5f0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) bit = sync ? 
BDI_sync_congested : BDI_async_congested; if (test_and_clear_bit(bit, &bdi->state)) atomic_dec(&nr_bdi_congested[sync]); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } diff --git a/mm/filemap.c b/mm/filemap.c index a82fbe4c9e8e..c73535c914cc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -740,7 +740,7 @@ void unlock_page(struct page *page) { VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -757,7 +757,7 @@ void end_page_writeback(struct page *page) if (!test_clear_page_writeback(page)) BUG(); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); -- cgit v1.2.3 From 5835f25117ef6a56144bfc6be98b5a3cb188bf7a Mon Sep 17 00:00:00 2001 From: Hiroshige Sato Date: Wed, 16 Apr 2014 21:28:34 +0900 Subject: mm: Fix printk typo in dmapool.c Fix printk typo in dmapool.c Signed-off-by: Hiroshige Sato Signed-off-by: Jiri Kosina --- mm/dmapool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index c69781e97cf9..8058fcd7ae91 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -341,10 +341,10 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, continue; if (pool->dev) dev_err(pool->dev, - "dma_pool_alloc %s, %p (corruped)\n", + "dma_pool_alloc %s, %p (corrupted)\n", pool->name, retval); else - pr_err("dma_pool_alloc %s, %p (corruped)\n", + pr_err("dma_pool_alloc %s, %p (corrupted)\n", pool->name, retval); /* -- cgit v1.2.3 From 107437febd495a50e2cd09c81bbaa84d30e57b07 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 29 Apr 2014 15:36:15 -0400 Subject: mm/numa: Remove BUG_ON() in __handle_mm_fault() Changing PTEs and PMDs to pte_numa & pmd_numa is done with the mmap_sem held for reading, which means a pmd can be instantiated and turned into a numa one while __handle_mm_fault() is examining the value of old_pmd. If that happens, __handle_mm_fault() should just return and let the page fault retry, instead of throwing an oops. This is handled by the test for pmd_trans_huge(*pmd) below. Signed-off-by: Rik van Riel Reviewed-by: Naoya Horiguchi Reported-by: Sunil Pandey Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: linux-mm@kvack.org Cc: lwoodman@redhat.com Cc: dave.hansen@intel.com Link: http://lkml.kernel.org/r/20140429153615.2d72098e@annuminas.surriel.com Signed-off-by: Ingo Molnar --- mm/memory.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d0f0bef3be48..9c2dc659f6f6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3900,9 +3900,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); - /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could -- cgit v1.2.3 From 719c555f4424b194905aa3512a754c0444f27ce8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 May 2014 20:01:52 -0600 Subject: block: move mm/bounce.c to block/ Continue moving some of the block files that are scattered around. bounce.c contains only code for bouncing the contents of a bio. It's block proper code, not mm code. 
Suggested-by: Ming Lei Signed-off-by: Jens Axboe --- mm/Makefile | 1 - mm/bounce.c | 287 ------------------------------------------------------------ 2 files changed, 288 deletions(-) delete mode 100644 mm/bounce.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index b484452dac57..0173940407f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,7 +30,6 @@ endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o diff --git a/mm/bounce.c b/mm/bounce.c deleted file mode 100644 index 523918b8c6dc..000000000000 --- a/mm/bounce.c +++ /dev/null @@ -1,287 +0,0 @@ -/* bounce buffer handling for block devices - * - * - Split from highmem.c - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define POOL_SIZE 64 -#define ISA_POOL_SIZE 16 - -static mempool_t *page_pool, *isa_page_pool; - -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) -static __init int init_emergency_pool(void) -{ -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) - if (max_pfn <= max_low_pfn) - return 0; -#endif - - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); - printk("bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); -#endif - -#ifdef CONFIG_HIGHMEM -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. 
kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; - struct bvec_iter iter; - - bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { - /* - * fromvec->bv_offset and fromvec->bv_len might have - * been modified by the block layer, so use the original - * copy, bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); - } - - fromvec++; - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, err); - bio_put(bio); -} - -static void bounce_end_io_write(struct bio *bio, int err) -{ - bounce_end_io(bio, page_pool, err); -} - -static void bounce_end_io_write_isa(struct bio *bio, int err) -{ - - bounce_end_io(bio, isa_page_pool, err); -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static void bounce_end_io_read(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, page_pool, err); -} - -static void bounce_end_io_read_isa(struct bio *bio, int err) -{ - __bounce_end_io_read(bio, isa_page_pool, err); -} - -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - return 0; -} -#endif /* CONFIG_NEED_BOUNCE_POOL */ - -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) -{ - struct bio *bio; - int rw = bio_data_dir(*bio_orig); - struct bio_vec *to, from; - struct bvec_iter iter; - unsigned i; - - if (force) - goto bounce; - bio_for_each_segment(from, *bio_orig, iter) - if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) - goto bounce; - - return; -bounce: - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - - bio_for_each_segment_all(to, bio, i) { - struct page *page = to->bv_page; - - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) - continue; - - inc_zone_page_state(to->bv_page, NR_BOUNCE); - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); - } - } - - trace_block_bio_bounce(q, *bio_orig); - - bio->bi_flags |= (1 << BIO_BOUNCED); - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io 
= bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - int must_bounce; - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); -} - -EXPORT_SYMBOL(blk_queue_bounce); -- cgit v1.2.3 From f1af9d3af308145478749194346f11efad1134b2 Mon Sep 17 00:00:00 2001 From: Philipp Hachtmann Date: Wed, 29 Jan 2014 18:16:01 +0100 Subject: mm/memblock: Do some refactoring, enhance API Refactor the memblock code and extend the memblock API to make it more flexible. With the extended API it is simple to define and work with additional memory lists. The static functions memblock_add_region and __memblock_remove are renamed to memblock_add_range and meblock_remove_range and added to the memblock API. The __next_free_mem_range and __next_free_mem_range_rev functions are replaced with calls to the more generic list walkers __next_mem_range and __next_mem_range_rev. To walk an arbitrary memory list two new macros for_each_mem_range and for_each_mem_range_rev are added. These new macros are used to define for_each_free_mem_range and for_each_free_mem_range_reverse. Signed-off-by: Philipp Hachtmann Signed-off-by: Martin Schwidefsky --- mm/memblock.c | 193 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index e9d6ca9a01a9..9edd0928daab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -472,7 +472,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, } /** - * memblock_add_region - add new memblock region + * memblock_add_range - add new memblock region * @type: memblock type to add new region into * @base: base address of the new region * @size: size of the new region @@ -487,7 +487,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * RETURNS: * 0 on success, -errno on failure. 
*/ -static int __init_memblock memblock_add_region(struct memblock_type *type, +int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, unsigned long flags) { @@ -569,12 +569,12 @@ repeat: int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, int nid) { - return memblock_add_region(&memblock.memory, base, size, nid, 0); + return memblock_add_range(&memblock.memory, base, size, nid, 0); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { - return memblock_add_region(&memblock.memory, base, size, + return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } @@ -654,8 +654,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -static int __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_remove_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; int i, ret; @@ -671,9 +671,10 @@ static int __init_memblock __memblock_remove(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { - return __memblock_remove(&memblock.memory, base, size); + return memblock_remove_range(&memblock.memory, base, size); } + int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", @@ -681,7 +682,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); - return __memblock_remove(&memblock.reserved, base, size); + return memblock_remove_range(&memblock.reserved, base, size); } static int __init_memblock memblock_reserve_region(phys_addr_t base, @@ -696,7 +697,7 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_region(_rgn, base, size, nid, flags); + return memblock_add_range(_rgn, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) @@ -758,17 +759,19 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** - * __next_free_mem_range - next function for for_each_free_mem_range() + * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Find the first free area from *@idx which matches @nid, fill the out + * Find the first area from *@idx which matches @nid, fill the out * parameters, and update *@idx for the next iteration. The lower 32bit of - * *@idx contains index into memory region and the upper 32bit indexes the - * areas before each reserved region. For example, if reserved regions + * *@idx contains index into type_a and the upper 32bit indexes the + * areas before each region in type_b. 
For example, if type_b regions * look like the following, * * 0:[0-16), 1:[32-48), 2:[128-130) @@ -780,53 +783,77 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_free_mem_range(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; - for ( ; mi < mem->cnt; mi++) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a < type_a->cnt; idx_a++) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri < rsv->cnt + 1; ri++) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b < type_b->cnt + 1; idx_b++) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : ULLONG_MAX; - /* if ri advanced past mi, break out to advance mi */ + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ if (r_start >= m_end) break; /* if the two regions intersect, we're done */ if (m_start < r_end) { if (out_start) - *out_start = max(m_start, r_start); + *out_start = + max(m_start, r_start); if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); + *out_nid = m_nid; /* - * The region which ends first is advanced - * for the next iteration. + * The region which ends first is + * advanced for the next iteration. */ if (m_end <= r_end) - mi++; + idx_a++; else - ri++; - *idx = (u32)mi | (u64)ri << 32; + idx_b++; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } @@ -837,57 +864,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, } /** - * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() + * __next_mem_range_rev - generic next function for for_each_*_range_rev() + * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. 
+ * * @idx: pointer to u64 loop variable * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL * - * Reverse of __next_free_mem_range(). - * - * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't - * be able to hot-remove hotpluggable memory used by the kernel. So this - * function skip hotpluggable regions if needed when allocating memory for the - * kernel. + * Reverse of __next_mem_range(). */ -void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, - phys_addr_t *out_start, - phys_addr_t *out_end, int *out_nid) +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - struct memblock_type *mem = &memblock.memory; - struct memblock_type *rsv = &memblock.reserved; - int mi = *idx & 0xffffffff; - int ri = *idx >> 32; + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; if (*idx == (u64)ULLONG_MAX) { - mi = mem->cnt - 1; - ri = rsv->cnt; + idx_a = type_a->cnt - 1; + idx_b = type_b->cnt; } - for ( ; mi >= 0; mi--) { - struct memblock_region *m = &mem->regions[mi]; + for (; idx_a >= 0; idx_a--) { + struct memblock_region *m = &type_a->regions[idx_a]; + phys_addr_t m_start = m->base; phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m)) + if (nid != NUMA_NO_NODE && nid != m_nid) continue; /* skip hotpluggable memory regions if needed */ if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; - /* scan areas before each reservation for intersection */ - for ( ; ri >= 0; ri--) { - struct memblock_region *r = &rsv->regions[ri]; - phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; - phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b >= 0; idx_b--) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? 
+ r->base : ULLONG_MAX; + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ - /* if ri advanced past mi, break out to advance mi */ if (r_end <= m_start) break; /* if the two regions intersect, we're done */ @@ -897,18 +947,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, if (out_end) *out_end = min(m_end, r_end); if (out_nid) - *out_nid = memblock_get_region_node(m); - + *out_nid = m_nid; if (m_start >= r_start) - mi--; + idx_a--; else - ri--; - *idx = (u32)mi | (u64)ri << 32; + idx_b--; + *idx = (u32)idx_a | (u64)idx_b << 32; return; } } } - + /* signal end of iteration */ *idx = ULLONG_MAX; } @@ -1201,7 +1250,7 @@ void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) __func__, (u64)base, (u64)base + size - 1, (void *)_RET_IP_); kmemleak_free_part(__va(base), size); - __memblock_remove(&memblock.reserved, base, size); + memblock_remove_range(&memblock.reserved, base, size); } /* @@ -1287,8 +1336,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) } /* truncate both memory and reserved regions */ - __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); - __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.memory, max_addr, + (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) -- cgit v1.2.3 From 70210ed950b538ee7eb811dccc402db9df1c9be4 Mon Sep 17 00:00:00 2001 From: Philipp Hachtmann Date: Wed, 29 Jan 2014 18:16:01 +0100 Subject: mm/memblock: add physical memory list Add the physmem list to the memblock structure. This list only exists if HAVE_MEMBLOCK_PHYS_MAP is selected and contains the unmodified list of physically available memory. It differs from the memblock memory list as it always contains all memory ranges even if the memory has been restricted, e.g. by use of the mem= kernel parameter. 
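To make the intent of these two memblock patches concrete, here is a hedged sketch (made-up addresses and a hypothetical helper; the real consumer is the s390 work built on top of this series) of recording the raw memory layout in the physmem list and walking it with the generalized walker from the previous patch, passing NULL so that nothing is excluded:

	/*
	 * Illustrative only: assumes CONFIG_HAVE_MEMBLOCK_PHYS_MAP is
	 * selected and uses made-up addresses (SZ_* from <linux/sizes.h>).
	 */
	static void __init example_record_memory(void)
	{
		phys_addr_t start, end;
		int nid;
		u64 idx = 0;

		/* full hardware layout, unaffected by mem= trimming */
		memblock_add_range(&memblock.physmem, 0x0, SZ_2G,
				   MAX_NUMNODES, 0);
		/* usable memory, possibly a restricted subset of the above */
		memblock_add(0x0, SZ_1G);

		/* walk physmem; type_b == NULL means "exclude nothing" */
		for (__next_mem_range(&idx, NUMA_NO_NODE, &memblock.physmem,
				      NULL, &start, &end, &nid);
		     idx != ULLONG_MAX;
		     __next_mem_range(&idx, NUMA_NO_NODE, &memblock.physmem,
				      NULL, &start, &end, &nid))
			pr_info("physmem: [%pa-%pa]\n", &start, &end);
	}
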
Signed-off-by: Philipp Hachtmann Signed-off-by: Martin Schwidefsky --- mm/Kconfig | 3 +++ mm/memblock.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 1b5a95f0fa01..28cec518f4d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -134,6 +134,9 @@ config HAVE_MEMBLOCK config HAVE_MEMBLOCK_NODE_MAP boolean +config HAVE_MEMBLOCK_PHYS_MAP + boolean + config ARCH_DISCARD_MEMBLOCK boolean diff --git a/mm/memblock.c b/mm/memblock.c index 9edd0928daab..a810ba923cdd 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -27,6 +27,9 @@ static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; +#endif struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, @@ -37,6 +40,12 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + .physmem.regions = memblock_physmem_init_regions, + .physmem.cnt = 1, /* empty dummy entry */ + .physmem.max = INIT_PHYSMEM_REGIONS, +#endif + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -1553,6 +1562,9 @@ static int __init memblock_init_debugfs(void) return -ENXIO; debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); +#endif return 0; } -- cgit v1.2.3 From a62c34bd2a8a3f159945becd57401e478818d51c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:33 -0700 Subject: x86, mm: Improve _install_special_mapping and fix x86 vdso naming Using arch_vma_name to give special mappings a name is awkward. x86 currently implements it by comparing the start address of the vma to the expected address of the vdso. This requires tracking the start address of special mappings and is probably buggy if a special vma is split or moved. Improve _install_special_mapping to just name the vma directly. Use it to give the x86 vvar area a name, which should make CRIU's life easier. As a side effect, the vvar area will show up in core dumps. This could be considered weird and is fixable. [hpa: I say we accept this as-is but be prepared to deal with knocking out the vvars from core dumps if this becomes a problem.] Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/276b39b6b645fb11e345457b503f17b83c2c6fd0.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- mm/mmap.c | 89 ++++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf81f4b..52bbc9514d9d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2872,6 +2872,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. 
+ */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2887,7 +2912,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2900,30 +2931,11 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. - */ -struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; @@ -2940,8 +2952,8 @@ struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2958,12 +2970,31 @@ out: return ERR_PTR(ret); } +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. 
+ */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { - struct vm_area_struct *vma = _install_special_mapping(mm, - addr, len, vm_flags, pages); + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); if (IS_ERR(vma)) return PTR_ERR(vma); -- cgit v1.2.3 From 7f39dda9d86fb4f4f17af0de170decf125726f8c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Jun 2014 16:05:33 -0700 Subject: mm: fix sleeping function warning from __put_anon_vma Trinity reports BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:47 in_atomic(): 0, irqs_disabled(): 0, pid: 5787, name: trinity-c27 __might_sleep < down_write < __put_anon_vma < page_get_anon_vma < migrate_pages < compact_zone < compact_zone_order < try_to_compact_pages .. Right, since conversion to mutex then rwsem, we should not put_anon_vma() from inside an rcu_read_lock()ed section: fix the two places that did so. And add might_sleep() to anon_vma_free(), as suggested by Peter Zijlstra. Fixes: 88c22088bf23 ("mm: optimize page_lock_anon_vma() fast-path") Reported-by: Dave Jones Signed-off-by: Hugh Dickins Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 9c3e77396d1a..10aef960d3d0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ + might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); @@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) * above cannot corrupt). */ if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; + return NULL; } out: rcu_read_unlock(); @@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; - goto out; + return NULL; } /* we pinned the anon_vma, its safe to sleep */ -- cgit v1.2.3 From c177c81e09e517bbf75b67762cdab1b83aba6976 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:05:35 -0700 Subject: hugetlb: restrict hugepage_migration_support() to x86_64 Currently hugepage migration is available for all archs which support pmd-level hugepage, but testing is done only for x86_64 and there're bugs for other archs. So to avoid breaking such archs, this patch limits the availability strictly to x86_64 until developers of other archs get interested in enabling this feature. Simply disabling hugepage migration on non-x86_64 archs is not enough to fix the reported problem where sys_move_pages() hits the BUG_ON() in follow_page(FOLL_GET), so let's fix this by checking if hugepage migration is supported in vma_migratable(). 
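The mm/ hunk below only introduces the Kconfig symbol; the vma_migratable() change the changelog refers to lives outside mm/. As a hedged sketch of the kind of guard being described (the exact upstream function body may differ):

	/* sketch, not the verbatim vma_migratable() implementation */
	static inline int vma_migratable_sketch(struct vm_area_struct *vma)
	{
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			return 0;

	#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
		/* no arch opt-in: never try to migrate hugetlb mappings */
		if (vma->vm_flags & VM_HUGETLB)
			return 0;
	#endif
		return 1;
	}
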
Signed-off-by: Naoya Horiguchi Reported-by: Michael Ellerman Tested-by: Michael Ellerman Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: Tony Luck Cc: Russell King Cc: Martin Schwidefsky Cc: James Hogan Cc: Ralf Baechle Cc: David Miller Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 28cec518f4d4..75ac479cbacd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -267,6 +267,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -- cgit v1.2.3 From c46a7c817e662a820373bb76b88d0ad67d6abe5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:30 -0700 Subject: x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels _PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific and conceptually it is difficult requiring special casing to distinguish between PROTNONE and NUMA ptes based on context. Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults as if the PTE is not present then a fault will be trapped. Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e302ae1dcce0..0897830011f3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) + if (likely(!pte_special(pte) || pte_numa(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - if (is_zero_pfn(pfn)) - return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } + if (is_zero_pfn(pfn)) + return NULL; + /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. @@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. 
+ * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; -- cgit v1.2.3 From f9f58285947d9c88079bfb7b7666c987011e3377 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:34 -0700 Subject: mm/slub.c: convert printk to pr_foo() All printk(KERN_foo converted to pr_foo() Default printk converted to pr_warn() Coalesce format fragments Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Cc: Joe Perches Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 129 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 57 insertions(+), 72 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2b1ce697fc4b..1594b14e2597 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; for (i = 0; i < TRACK_ADDRS_COUNT; i++) if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + pr_err("\t%pS\n", (void *)t->addrs[i]); else break; } @@ -571,8 +571,7 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR - "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", page, page->objects, page->inuse, page->freelist, page->flags); } @@ -585,11 +584,9 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %s\n", s->name, print_tainted(), buf); + pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } @@ -602,7 +599,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); + pr_err("FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -614,8 +611,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); @@ -698,7 +695,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -931,7 +928,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, @@ -1134,9 +1131,8 @@ static noinline struct kmem_cache_node *free_debug_processing( slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB : no slab for object 0x%p.\n", - object); + pr_err("SLUB : no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -1219,8 +1215,8 @@ static int __init setup_slub_debug(char *str) slub_debug |= SLAB_FAILSLAB; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -1770,19 +1766,19 @@ static inline void note_cmpxchg_failure(const char *n, #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); #ifdef CONFIG_PREEMPT if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); else #endif if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + pr_warn("due to cpu running other code. 
Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -2154,16 +2150,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { int node; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -2178,8 +2173,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } } @@ -2894,10 +2888,8 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; @@ -3182,8 +3174,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3650,9 +3641,7 @@ void __init kmem_cache_init(void) register_cpu_notifier(&slab_notifier); #endif - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3934,8 +3923,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3945,9 +3934,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -4211,53 +4199,50 @@ static void 
resiliency_test(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else @@ -5303,7 +5288,7 @@ static int __init slab_sysfs_init(void) slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -5312,8 +5297,8 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5322,8 +5307,8 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } -- cgit v1.2.3 From ecc42fbe952fa4aae88c2413e21912b1d665fb93 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:35 -0700 Subject: mm/slub.c: convert vnsprintf-static to va_format Inspired by Joe Perches suggestion in ntfs logging clean-up. 
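For context on the %pV usage in the next patch: printk can expand a struct va_format in place, so the varargs no longer have to be pre-rendered into a fixed 100-byte buffer as slab_bug()/slab_fix() do above. A minimal, self-contained sketch of the idiom (the helper name is made up):

	/* hypothetical helper showing the %pV / struct va_format idiom */
	static void example_report(const char *prefix, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* single printk, no intermediate buffer, no truncation */
		pr_err("%s: %pV\n", prefix, &vaf);
		va_end(args);
	}
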
Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Cc: Joe Perches Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 1594b14e2597..de99d500af6c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -578,28 +578,30 @@ static void print_page_info(struct page *page) static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %s\n", s->name, print_tainted(), buf); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - pr_err("FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) -- cgit v1.2.3 From 9a02d699935c9acdfefe431bbc33771d1d87da7f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:06:36 -0700 Subject: mm, slab: suppress out of memory warning unless debug is enabled When the slab or slub allocators cannot allocate additional slab pages, they emit diagnostic information to the kernel log such as current number of slabs, number of objects, active objects, etc. This is always coupled with a page allocation failure warning since it is controlled by !__GFP_NOWARN. Suppress this out of memory warning if the allocator is configured without debug supported. The page allocation failure warning will indicate it is a failed slab allocation, the order, and the gfp mask, so this is only useful to diagnose allocator issues. Since CONFIG_SLUB_DEBUG is already enabled by default for the slub allocator, there is no functional change with this patch. If debug is disabled, however, the warnings are now suppressed. 
Signed-off-by: David Rientjes Cc: Pekka Enberg Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 10 ++++++++-- mm/slub.c | 29 +++++++++++++++++------------ 2 files changed, 25 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 19d92181ce24..5c846d25c17d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1621,10 +1621,16 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { +#if DEBUG struct kmem_cache_node *n; struct page *page; unsigned long flags; int node; + static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) + return; printk(KERN_WARNING "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", @@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } +#endif } /* @@ -1683,8 +1690,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - if (!(flags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(cachep, flags, nodeid); + slab_out_of_memory(cachep, flags, nodeid); return NULL; } diff --git a/mm/slub.c b/mm/slub.c index de99d500af6c..65a0a5c57f31 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2119,11 +2119,19 @@ static inline int node_match(struct page *page, int node) return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -2137,21 +2145,19 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", @@ -2178,6 +2184,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif } static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2356,9 +2363,7 @@ new_slab: freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - + slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; } -- cgit v1.2.3 From 8eae1492675d0ffc12189f8db573624413232e15 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:06:37 
-0700 Subject: mm: slub: fix ALLOC_SLOWPATH stat There used to be only one path out of __slab_alloc(), and ALLOC_SLOWPATH got bumped in that exit path. Now there are two, and a bunch of gotos. ALLOC_SLOWPATH can now get set more than once during a single call to __slab_alloc(), which is pretty bogus. Here's the sequence:
1. Enter __slab_alloc(), fall through all the way to the stat(s, ALLOC_SLOWPATH);
2. hit 'if (!freelist)', and bump DEACTIVATE_BYPASS, jump to new_slab (goto #1)
3. Hit 'if (c->partial)', bump CPU_PARTIAL_ALLOC, goto redo (goto #2)
4. Fall through in the same path we did before all the way to stat(s, ALLOC_SLOWPATH)
5. bump ALLOC_REFILL stat, then return
Doing this is obviously bogus. It keeps us from being able to accurately compare ALLOC_SLOWPATH vs. ALLOC_FASTPATH. It also means that the total number of allocs always exceeds the total number of frees. This patch moves stat(s, ALLOC_SLOWPATH) to be called from the same place that __slab_alloc() is called. This makes it much less likely that ALLOC_SLOWPATH will get botched again in the spaghetti-code inside __slab_alloc(). Signed-off-by: Dave Hansen Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 65a0a5c57f31..d05a5483106d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2326,8 +2326,6 @@ redo: if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - freelist = get_freelist(s, page); if (!freelist) { @@ -2432,10 +2430,10 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - - else { + stat(s, ALLOC_SLOWPATH); + } else { void *next_object = get_freepointer_safe(s, object); /* -- cgit v1.2.3 From 5dfb417509921eb90ee123a4d1525e8916b4ace4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:38 -0700 Subject: sl[au]b: charge slabs to kmemcg explicitly We have only a few places where we actually want to charge kmem, so instead of intruding into the general page allocation path with __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem charges will be easier to follow that way. This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG from memcg caches' allocflags. Instead it makes the slab allocation path call memcg_charge_kmem directly, getting the memcg to charge from the cache's memcg params. This also eliminates any possibility of misaccounting an allocation going from one memcg's cache to another memcg, because now we always charge slabs against the memcg the cache belongs to. That's why this patch removes the big comment from memcg_kmem_get_cache.
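To make the new accounting flow concrete, here is a minimal editor's sketch of the charge-then-allocate pattern this patch adopts (assuming the memcg_charge_slab()/memcg_uncharge_slab() helpers the patch introduces; the wrapper function name is made up):

	/*
	 * Charge the cache's memcg before asking the page allocator,
	 * and roll the charge back if the allocation fails. This mirrors
	 * what the patch does in kmem_getpages() and alloc_slab_page().
	 */
	static struct page *charged_alloc_sketch(struct kmem_cache *s,
						 gfp_t gfp, int order)
	{
		struct page *page;

		if (memcg_charge_slab(s, gfp, order))	/* charge first */
			return NULL;
		page = alloc_pages(gfp, order);
		if (!page)
			memcg_uncharge_slab(s, order);	/* undo on failure */
		return page;
	}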
Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- mm/slab.c | 7 ++++++- mm/slab.h | 29 +++++++++++++++++++++++++++++ mm/slab_common.c | 6 +----- mm/slub.c | 24 +++++++++++++++++------- 5 files changed, 55 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5177c6d4a2dd..56a768b3d5a8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2953,7 +2953,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2991,7 +2991,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) diff --git a/mm/slab.c b/mm/slab.c index 5c846d25c17d..944ac58cfcf8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1688,8 +1688,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; + if (memcg_charge_slab(cachep, flags, cachep->gfporder)) + return NULL; + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { + memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } @@ -1747,7 +1751,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index 6bd4c353704f..863e67b8c8c9 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -192,6 +192,26 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; return s->memcg_params->root_cache; } + +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) +{ + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return memcg_charge_kmem(s->memcg_params->memcg, gfp, + PAGE_SIZE << order); +} + +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -227,6 +247,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; } + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 102cc6fca3d3..06f0c6125632 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c 
root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) { + if (IS_ERR(s)) kfree(cache_name); - goto out_unlock; - } - - s->allocflags |= __GFP_KMEMCG; out_unlock: mutex_unlock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index d05a5483106d..fc9831851be6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1312,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1344,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; alloc_gfp = flags; @@ -1352,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1468,7 +1477,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_pages(page, order); + memcg_uncharge_slab(s, order); } #define need_reserve_slab_rcu \ -- cgit v1.2.3 From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. 
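For callers, the conversion this enables is mechanical. A hedged before/after fragment (illustrative only, not a hunk from the patch; it matches the slub.c changes further down):

	/* Before: kmemcg accounting requested via a GFP flag. */
	page = alloc_pages_node(node, flags | __GFP_KMEMCG, order);
	/* ... */
	__free_memcg_kmem_pages(page, order);

	/* After: accounting is implied by the dedicated helpers. */
	page = alloc_kmem_pages_node(node, flags, order);
	/* ... */
	__free_kmem_pages(page, order);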
[sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++----- mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++--------------------- mm/slab_common.c | 13 +++++++++++++ mm/slub.c | 6 +++--- 4 files changed, 57 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b3d5a8..7bab1de50f48 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba2933c9c0..7cfdcd808f52 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. 
+ * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. */ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c6125632..1950c8f4d1a6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc9831851be6..ddb60795f373 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit v1.2.3 From 0bf073315cb29d2e9e68b6c5da97862a519e3320 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:06:41 -0700 Subject: mm: softdirty: make freshly remapped file pages being softdirty unconditionally Hugh reported: | I noticed your soft_dirty work in install_file_pte(): which looked | good at first, until I realized that it's propagating the soft_dirty | of a pte it's about to zap completely, to the unrelated entry it's | about to insert in its place. Which seems very odd to me. Indeed, this code ends up being a no-op: pte_file_mksoft_dirty() takes its pte_t argument by value and returns a new pte_t, which was never used afterwards.
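In other words, the bug is plain C pass-by-value semantics. A small standalone sketch (editor's illustration; pte_t is modeled here as a simple value type and PTE_SOFT_DIRTY is a made-up flag, not the kernel's real bit):

	#include <assert.h>

	typedef struct { unsigned long val; } pte_t;
	#define PTE_SOFT_DIRTY (1UL << 1)

	/* Takes its argument by value and returns a modified copy. */
	static pte_t pte_file_mksoft_dirty(pte_t pte)
	{
		pte.val |= PTE_SOFT_DIRTY;
		return pte;
	}

	int main(void)
	{
		pte_t ptfile = { 0 };

		pte_file_mksoft_dirty(ptfile);          /* no-op: result dropped */
		assert(!(ptfile.val & PTE_SOFT_DIRTY));

		ptfile = pte_file_mksoft_dirty(ptfile); /* correct: result assigned */
		assert(ptfile.val & PTE_SOFT_DIRTY);
		return 0;
	}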
After looking more I think what we need is to soft-dirtify all newly remapped file pages because it should look like a new mapping to the memory tracker. Signed-off-by: Cyrill Gorcunov Reported-by: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 34feba60a17e..2c5646f11f41 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ptfile = pgoff_to_pte(pgoff); - if (!pte_none(*pte)) { - if (pte_present(*pte) && pte_soft_dirty(*pte)) - pte_file_mksoft_dirty(ptfile); + if (!pte_none(*pte)) zap_pte(mm, vma, addr, pte); - } - set_pte_at(mm, addr, pte, ptfile); + set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a -- cgit v1.2.3 From b43790eedd31e9535b89bbfa45793919e9504c34 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:06:42 -0700 Subject: mm: softdirty: don't forget to save file map softdirty bit on unmap pte_file_mksoft_dirty() operates with an argument passed by value and returns the modified result, thus we need to assign @ptfile here, otherwise it is a no-op which may lead to loss of the softdirty bit. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 10aef960d3d0..7da400d5d98e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1361,7 +1361,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (page->index != linear_page_index(vma, address)) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(pteval)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, address, pte, ptfile); } -- cgit v1.2.3 From 6f04f48dc9c0433e2bb687f5f7f7af1aba97b04d Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Wed, 4 Jun 2014 16:06:44 -0700 Subject: mm: only force scan in reclaim when none of the LRUs are big enough. Prior to this change, we would decide whether to force scan an LRU during reclaim if that LRU itself was too small for the current priority. However, this can lead to the file LRU getting force scanned even if there are a lot of anonymous pages we can reclaim, leading to hot file pages getting needlessly reclaimed. To address this, we instead only force scan when none of the reclaimable LRUs are big enough. Gives huge improvements with zswap. For example, when doing -j20 kernel build in a 500MB container with zswap enabled, runtime (in seconds) is greatly reduced:
x without this change
+ with this change
    N      Min      Max   Median      Avg     Stddev
x   5  700.997  790.076  763.928   754.05   39.59493
+   5  141.634  197.899  155.706    161.9  21.270224
Difference at 95.0% confidence
        -592.15 +/- 46.3521
        -78.5293% +/- 6.14709%
        (Student's t, pooled s = 31.7819)
Should also give some improvements in regular (non-zswap) swap cases. Yes, hughd found significant speedup using regular swap, with several memcgs under pressure; and it should also be effective in the non-memcg case, whenever one or another zone LRU is forced too small.
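The reworked decision can be condensed into the following sketch (simplified from the get_scan_count() diff below; the proportional scaling cases are elided):

	/*
	 * Pass 0 computes the scan targets normally; force_scan is
	 * only applied on the second pass, and the second pass only
	 * runs if the first pass found nothing to scan on any LRU.
	 */
	bool some_scanned = false;
	int pass;

	for (pass = 0; !some_scanned && pass < 2; pass++) {
		for_each_evictable_lru(lru) {
			unsigned long size = get_lru_size(lruvec, lru);
			unsigned long scan = size >> sc->priority;

			if (!scan && pass && force_scan)
				scan = min(size, SWAP_CLUSTER_MAX);
			nr[lru] = scan;
			some_scanned |= !!scan;
		}
	}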
Signed-off-by: Suleiman Souhlal Signed-off-by: Hugh Dickins Cc: Suleiman Souhlal Cc: Mel Gorman Acked-by: Rik van Riel Acked-by: Rafael Aquini Cc: Michal Hocko Cc: Yuanhan Liu Cc: Seth Jennings Cc: Bob Liu Cc: Minchan Kim Cc: Luigi Semenzato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 66 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d66a45..7901cb749e17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1866,6 +1866,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1989,39 +1991,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } -- cgit v1.2.3 From 4f115147ff802267d0aa41e361c5aa5bd933d896 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Jun 2014 16:06:46 -0700 Subject: mm,vmacache: add debug data Introduce a CONFIG_DEBUG_VM_VMACACHE option to enable counting the cache hit rate -- exported in /proc/vmstat. Any updates to the caching scheme needs this kind of data, thus it can save some work re-implementing the counting all the time. 
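Once exported, the hit rate can be computed from userspace. A hedged sketch (assumes the two counter names added by this patch appear in /proc/vmstat, which requires CONFIG_DEBUG_VM_VMACACHE):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char name[64];
		unsigned long long val, calls = 0, hits = 0;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "vmacache_find_calls"))
				calls = val;
			else if (!strcmp(name, "vmacache_find_hits"))
				hits = val;
		}
		fclose(f);
		if (calls)
			printf("vmacache hit rate: %.2f%%\n",
			       100.0 * hits / calls);
		return 0;
	}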
Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 12 ++++++++++-- mm/vmstat.c | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmacache.c b/mm/vmacache.c index 1037a3bab505..658ed3b3e38d 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -78,6 +78,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; @@ -85,8 +87,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) continue; if (WARN_ON_ONCE(vma->vm_mm != mm)) break; - if (vma->vm_start <= addr && vma->vm_end > addr) + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; + } } return NULL; @@ -102,11 +106,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; - if (vma && vma->vm_start == start && vma->vm_end == end) + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; + } } return NULL; diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd076b8bf..82ce17ce58c4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -866,6 +866,10 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + "vmacache_find_calls", + "vmacache_find_hits", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- cgit v1.2.3 From 6b4ebc3a9078c5b7b8c4cf495a0b1d2d0e0bfe7a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Jun 2014 16:06:47 -0700 Subject: mm,vmacache: optimize overflow system-wide flushing For single threaded workloads, we can avoid flushing and iterating through the entire list of tasks, making the whole function a lot faster, requiring only a single atomic read for the mm_users. Signed-off-by: Davidlohr Bueso Suggested-by: Oleg Nesterov Cc: Aswin Chandramouleeswaran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmacache.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/vmacache.c b/mm/vmacache.c index 658ed3b3e38d..9f25af825dec 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) { struct task_struct *g, *p; + /* + * Single threaded tasks need not iterate the entire + * list of process. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + rcu_read_lock(); for_each_process_thread(g, p) { /* -- cgit v1.2.3 From 2bfc2862c4fe38379a2fb2cfba33fad32ccb4ff4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:53 -0700 Subject: memblock: introduce memblock_alloc_range() This introduces memblock_alloc_range() which allocates memblock from the specified range of physical address. I would like to use this function to specify the location of CMA. 
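The point of the new helper is to pin an allocation inside a caller-chosen physical window. A hedged usage sketch (the size, alignment, and window values are made up for illustration):

	/* Reserve 16MB, 1MB-aligned, somewhere below 1GB, e.g. for CMA. */
	phys_addr_t base;

	base = memblock_alloc_range(SZ_16M, SZ_1M,	/* size, align */
				    0, SZ_1G);		/* physical window */
	if (!base)
		pr_warn("no contiguous 16MB region available below 1GB\n");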
Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index a810ba923cdd..146736411318 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1033,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid) +static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, 0, max_addr, nid); + found = memblock_find_in_range_node(size, align, start, end, nid); if (found && !memblock_reserve(found, size)) return found; return 0; } +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end) +{ + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); +} + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + return memblock_alloc_range_nid(size, align, 0, max_addr, nid); +} + phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); -- cgit v1.2.3 From ff9e43eb4f2eb78067d7b783cc893773b3e129b1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:06:57 -0700 Subject: thp: consolidate assert checks in __split_huge_page() It doesn't make sense to have two assert checks for each invariant: one for printing and one for BUG(). Let's trigger BUG() if we print error message. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d199d2d91946..2434d9059e5c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page, * the newly established pmd of the child later during the * walk, to be able to set it as pmd_trans_splitting too. */ - if (mapcount != page_mapcount(page)) + if (mapcount != page_mapcount(page)) { printk(KERN_ERR "mapcount %d page_mapcount %d\n", mapcount, page_mapcount(page)); - BUG_ON(mapcount != page_mapcount(page)); + BUG(); + } __split_huge_page_refcount(page, list); @@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page, BUG_ON(is_vma_temporary_stack(vma)); mapcount2 += __split_huge_page_map(page, vma, addr); } - if (mapcount != mapcount2) + if (mapcount != mapcount2) { printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", mapcount, mapcount2, page_mapcount(page)); - BUG_ON(mapcount != mapcount2); + BUG(); + } } /* -- cgit v1.2.3 From ae3a8c1c235345dfeb9b4b8c9e118802e3e84533 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:06:58 -0700 Subject: mm/huge_memory.c: complete conversion to pr_foo() It was using a mix of pr_foo() and printk(KERN_ERR ...). 
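The conversion leans on the pr_fmt() hook, which pr_err() and friends prepend to every format string. A minimal hedged example of how it composes (illustrative file, not a hunk from the patch):

	/* Must be defined before including headers that pull in printk.h. */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/printk.h>

	static void example(void)
	{
		/* Built as huge_memory.o, this prints:
		 * "huge_memory: failed to create transparent hugepage kobject"
		 */
		pr_err("failed to create transparent hugepage kobject\n");
	}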
Cc: Rik van Riel Cc: Mel Gorman Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2434d9059e5c..e60837dc785c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -5,6 +5,8 @@ * the COPYING file in the top-level directory. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -151,8 +153,7 @@ static int start_khugepaged(void) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (unlikely(IS_ERR(khugepaged_thread))) { - printk(KERN_ERR - "khugepaged: kthread_run(khugepaged) failed\n"); + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } @@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); + pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } out: if (!ret) - printk(KERN_WARNING - "transparent_hugepage= cannot parse, ignored\n"); + pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); @@ -1831,8 +1831,8 @@ static void __split_huge_page(struct page *page, * walk, to be able to set it as pmd_trans_splitting too. */ if (mapcount != page_mapcount(page)) { - printk(KERN_ERR "mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); BUG(); } @@ -1846,8 +1846,8 @@ static void __split_huge_page(struct page *page, mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) { - printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); BUG(); } } -- cgit v1.2.3 From 8bf8fcb07653fbaea74f96bba1e4ed0f851675ab Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 4 Jun 2014 16:07:00 -0700 Subject: mm/mempool: warn about __GFP_ZERO usage Memory obtained via mempool_alloc is not always zeroed even when called with __GFP_ZERO. Add a note and VM_BUG_ON statement to make that clear. [akpm@linux-foundation.org: use VM_WARN_ON_ONCE] Signed-off-by: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 905434f18c97..455d468c3a5d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. 
(it might * fail if called from an IRQ context.) + * Note: using __GFP_ZERO is not supported. */ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_sleep_if(gfp_mask & __GFP_WAIT); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ -- cgit v1.2.3 From 3dae7fec5e884a4e72e5416db0894de66f586201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Jun 2014 16:07:01 -0700 Subject: mm: memcontrol: remove hierarchy restrictions for swappiness and oom_control Per-memcg swappiness and oom killing can currently not be tweaked on a memcg that is part of a hierarchy, but not the root of that hierarchy. Users have complained that they can't configure this when they turned on hierarchy mode. In fact, with hierarchy mode becoming the default, this restriction disables the tunables entirely. But there is no good reason for this restriction. The settings for swappiness and OOM killing are taken from whatever memcg whose limit triggered reclaim and OOM invocation, regardless of its position in the hierarchy tree. Allow setting swappiness on any group. The knob on the root memcg already reads the global VM swappiness, make it writable as well. Allow disabling the OOM killer on any non-root memcg. Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bab1de50f48..20f47d9cd8b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5444,22 +5444,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (val > 100 || !parent) + if (val > 100) return -EINVAL; - mutex_lock(&memcg_create_mutex); - - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } - - memcg->swappiness = val; - - mutex_unlock(&memcg_create_mutex); + if (css_parent(css)) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5791,22 +5783,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css_parent(css) || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. 
*/ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } -- cgit v1.2.3 From cea371f4f39ced101d27264eddb8cf8c749fdd00 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:04 -0700 Subject: slab: document kmalloc_order Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 1950c8f4d1a6..2834bc2886fd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,11 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) { void *ret; -- cgit v1.2.3 From 14bd5b458bf62f84b2639ae288fd83d1da7a9af6 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Wed, 4 Jun 2014 16:07:05 -0700 Subject: mm/mmap.c: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO Fix a coccinelle error regarding usage of IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO. Signed-off-by: Duan Jiong Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf81f4b..6cdec3a6f4bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2965,9 +2965,7 @@ int install_special_mapping(struct mm_struct *mm, struct vm_area_struct *vma = _install_special_mapping(mm, addr, len, vm_flags, pages); - if (IS_ERR(vma)) - return PTR_ERR(vma); - return 0; + return PTR_ERR_OR_ZERO(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); -- cgit v1.2.3 From 2906dd52831b6049e1d4d9b12f6f234bf2f64a03 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:06 -0700 Subject: hugetlb: prep_compound_gigantic_page(): drop __init marker The HugeTLB subsystem uses the buddy allocator to allocate hugepages during runtime. This means that hugepages allocation during runtime is limited to MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes greater than MAX_ORDER), this in turn means that those pages can't be allocated at runtime. HugeTLB supports gigantic page allocation during boottime, via the boot allocator. To this end the kernel provides the command-line options hugepagesz= and hugepages=, which can be used to instruct the kernel to allocate N gigantic pages during boot. For example, x86_64 supports 2M and 1G hugepages, but only 2M hugepages can be allocated and freed at runtime. If one wants to allocate 1G gigantic pages, this has to be done at boot via the hugepagesz= and hugepages= command-line options. Now, gigantic page allocation at boottime has two serious problems: 1. Boottime allocation is not NUMA aware. On a NUMA machine the kernel evenly distributes boottime allocated hugepages among nodes. For example, suppose you have a four-node NUMA machine and want to allocate four 1G gigantic pages at boottime. The kernel will allocate one gigantic page per node. 
On the other hand, we do have users who want to be able to specify which NUMA node gigantic pages should be allocated from, so that they can place virtual machines on a specific NUMA node. 2. Gigantic pages allocated at boottime can't be freed. At this point it's important to observe that regular hugepages allocated at runtime don't have those problems. This is so because the HugeTLB interface for runtime allocation in sysfs supports NUMA, and runtime allocated pages can be freed just fine via the buddy allocator. This series adds support for allocating gigantic pages at runtime. It does so by allocating gigantic pages via CMA instead of the buddy allocator. Releasing gigantic pages is also supported via CMA. As this series builds on top of the existing HugeTLB interface, it makes gigantic page allocation and releasing work just like regular sized hugepages. This also means that NUMA support just works. For example, to allocate two 1G gigantic pages on node 1, one can do:
# echo 2 > \
/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
And, to release all gigantic pages on the same node:
# echo 0 > \
/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
Please refer to patch 5/5 for full technical details. Finally, please note that this series is a follow-up to a previous series that tried to extend the command-line options set to be NUMA aware: http://marc.info/?l=linux-mm&m=139593335312191&w=2 During the discussion of that series it was agreed that having runtime allocation support for gigantic pages was a better solution. This patch (of 5): This function is going to be used by non-init code in a future commit. Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. Shutemov Reviewed-by: Zhang Yanfei Cc: Marcelo Tosatti Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Rik van Riel Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c82290b9c1fc..5d54d4b8df01 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -690,8 +690,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void __init prep_compound_gigantic_page(struct page *page, - unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; -- cgit v1.2.3 From bae7f4ae14d47008a11b4358b167cb0ae186c06a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:08 -0700 Subject: hugetlb: add hstate_is_gigantic() Signed-off-by: Luiz Capitulino Reviewed-by: Andrea Arcangeli Reviewed-by: Naoya Horiguchi Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Davidlohr Bueso Acked-by: Kirill A.
Shutemov Reviewed-by: Zhang Yanfei Cc: David Rientjes Cc: Marcelo Tosatti Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d54d4b8df01..a66310586894 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -611,7 +611,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(h->order >= MAX_ORDER); + VM_BUG_ON(hstate_is_gigantic(h)); h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; @@ -664,7 +664,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -768,7 +768,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; page = alloc_pages_exact_node(nid, @@ -962,7 +962,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) struct page *page; unsigned int r_nid; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; /* @@ -1155,7 +1155,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; nr_pages = min(unused_resv_pages, h->surplus_huge_pages); @@ -1355,7 +1355,7 @@ static void __init gather_bootmem_prealloc(void) * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. 
*/ - if (h->order > (MAX_ORDER - 1)) + if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); } } @@ -1365,7 +1365,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, @@ -1381,7 +1381,7 @@ static void __init hugetlb_init_hstates(void) for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ - if (h->order < MAX_ORDER) + if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } @@ -1415,7 +1415,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, { int i; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; for_each_node_mask(i, *nodes_allowed) { @@ -1478,7 +1478,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return h->max_huge_pages; /* @@ -1605,7 +1605,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, goto out; h = kobj_to_hstate(kobj, &nid); - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { err = -EINVAL; goto out; } @@ -1688,7 +1688,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); @@ -2112,7 +2112,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, tmp = h->max_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; @@ -2168,7 +2168,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, tmp = h->nr_overcommit_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; -- cgit v1.2.3 From a7407a27c2bba3711d272d72d2d63ea147a929df Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:09 -0700 Subject: hugetlb: update_and_free_page(): don't clear PG_reserved bit Hugepages pages never get the PG_reserved bit set, so don't clear it. However, note that if the bit gets mistakenly set free_pages_check() will catch it. Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. 
Shutemov Reviewed-by: Zhang Yanfei Cc: Andrea Arcangeli Cc: David Rientjes Cc: Marcelo Tosatti Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a66310586894..c148eb295d79 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -618,8 +618,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1 << PG_writeback); + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); -- cgit v1.2.3 From 1cac6f2c072abe2510f56fec6729a892aa827f62 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:11 -0700 Subject: hugetlb: move helpers up in the file Next commit will add new code which will want to call for_each_node_mask_to_alloc() macro. Move it, its buddy for_each_node_mask_to_free() and their dependencies up in the file so the new code can use them. This is just code movement, no logic change. Signed-off-by: Luiz Capitulino Reviewed-by: Andrea Arcangeli Reviewed-by: Naoya Horiguchi Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. Shutemov Reviewed-by: Zhang Yanfei Cc: David Rientjes Cc: Marcelo Tosatti Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 146 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 73 insertions(+), 73 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c148eb295d79..5964d0de1777 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -607,6 +607,79 @@ err: return NULL; } +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. + */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
+ */ +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -786,79 +859,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. - * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. - */ -static int hstate_next_node_to_alloc(struct hstate *h, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for free_pool_huge_page() - return the previously saved - * node ["this node"] from which to free a huge page. Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; -- cgit v1.2.3 From 944d9fec8d7aee3f2e16573e9b6a16634b33f403 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:13 -0700 Subject: hugetlb: add support for gigantic page allocation at runtime HugeTLB is limited to allocating hugepages whose size are less than MAX_ORDER order. This is so because HugeTLB allocates hugepages via the buddy allocator. 
Gigantic pages (that is, pages whose size is greater than MAX_ORDER order) have to be allocated at boottime. However, boottime allocation has at least two serious problems. First, it doesn't support NUMA, and second, gigantic pages allocated at boottime can't be freed. This commit solves both issues by adding support for allocating gigantic pages during runtime. It works just like regular sized hugepages, meaning that the interface in sysfs is the same, it supports NUMA, and gigantic pages can be freed. For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G gigantic pages on node 1, one can do:
# echo 2 > \
/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
And to free them all:
# echo 0 > \
/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
The one problem with gigantic page allocation at runtime is that it can't be serviced by the buddy allocator. To overcome that problem, this commit scans all zones from a node looking for a large enough contiguous region. When one is found, it's allocated by using CMA, that is, we call alloc_contig_range() to do the actual allocation. For example, on x86_64 we scan all zones looking for a 1GB contiguous region. When one is found, it's allocated by alloc_contig_range(). One expected issue with that approach is that such gigantic contiguous regions tend to vanish as runtime goes by. The best way to avoid this for now is to make gigantic page allocations very early during system boot, say from an init script. Other possible optimizations include using compaction, which is supported by CMA but is not explicitly used by this commit. It's also important to note the following: 1. Gigantic pages allocated at boottime by the hugepages= command-line option can be freed at runtime just fine. 2. This commit adds support for gigantic pages only to x86_64. The reason is that I don't have access to nor experience with other archs. The code is arch independent though, so it should be simple to add support to different archs. 3. I didn't add support for hugepage overcommit, that is allocating a gigantic page on demand when /proc/sys/vm/nr_overcommit_hugepages > 0. The reason is that I don't think it's reasonable to do the hard and long work required for allocating a gigantic page at fault time. But it should be simple to add this if wanted. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A.
Shutemov Reviewed-by: Zhang Yanfei Reviewed-by: Yasuaki Ishimatsu Cc: Andrea Arcangeli Cc: David Rientjes Cc: Marcelo Tosatti Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 155 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5964d0de1777..98f0bc105dfe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -680,11 +680,150 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) +#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +static void destroy_compound_gigantic_page(struct page *page, + unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __ClearPageTail(p); + set_page_refcounted(p); + p->first_page = NULL; + } + + set_compound_order(page, 0); + __ClearPageHead(page); +} + +static void free_gigantic_page(struct page *page, unsigned order) +{ + free_contig_range(page_to_pfn(page), 1 << order); +} + +static int __alloc_gigantic_page(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long end_pfn = start_pfn + nr_pages; + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); +} + +static bool pfn_range_valid_gigantic(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + if (!pfn_valid(i)) + return false; + + page = pfn_to_page(i); + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + + return true; +} + +static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + return zone_spans_pfn(zone, last_pfn); +} + +static struct page *alloc_gigantic_page(int nid, unsigned order) +{ + unsigned long nr_pages = 1 << order; + unsigned long ret, pfn, flags; + struct zone *z; + + z = NODE_DATA(nid)->node_zones; + for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { + spin_lock_irqsave(&z->lock, flags); + + pfn = ALIGN(z->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(z, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... 
+ */ + spin_unlock_irqrestore(&z->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&z->lock, flags); + } + pfn += nr_pages; + } + + spin_unlock_irqrestore(&z->lock, flags); + } + + return NULL; +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); +static void prep_compound_gigantic_page(struct page *page, unsigned long order); + +static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) +{ + struct page *page; + + page = alloc_gigantic_page(nid, huge_page_order(h)); + if (page) { + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, nid); + } + + return page; +} + +static int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) +{ + struct page *page = NULL; + int nr_nodes, node; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_gigantic_page_node(h, node); + if (page) + return 1; + } + + return 0; +} + +static inline bool gigantic_page_supported(void) { return true; } +#else +static inline bool gigantic_page_supported(void) { return false; } +static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void destroy_compound_gigantic_page(struct page *page, + unsigned long order) { } +static inline int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) { return 0; } +#endif + static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(hstate_is_gigantic(h)); + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; @@ -697,8 +836,13 @@ static void update_and_free_page(struct hstate *h, struct page *page) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); - arch_release_hugepage(page); - __free_pages(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); + } } struct hstate *size_to_hstate(unsigned long size) @@ -737,7 +881,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) { + if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -841,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (hstate_is_gigantic(h)) - return NULL; - page = alloc_pages_exact_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -1478,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !gigantic_page_supported()) return h->max_huge_pages; /* @@ -1505,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. 
*/ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h, nodes_allowed); + if (hstate_is_gigantic(h)) + ret = alloc_fresh_gigantic_page(h, nodes_allowed); + else + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -1605,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, goto out; h = kobj_to_hstate(kobj, &nid); - if (hstate_is_gigantic(h)) { + if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; } @@ -2112,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, tmp = h->max_huge_pages; - if (write && hstate_is_gigantic(h)) + if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) return -EINVAL; table->data = &tmp; -- cgit v1.2.3 From 4f9b16a64753d0bb607454347036dc997fd03b82 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:14 -0700 Subject: mm: disable zone_reclaim_mode by default When it was introduced, zone_reclaim_mode made sense as NUMA distances punished and workloads were generally partitioned to fit into a NUMA node. NUMA machines are now common but few of the workloads are NUMA-aware and it's routine to see major performance degradation due to zone_reclaim_mode being enabled but relatively few can identify the problem. Those that require zone_reclaim_mode are likely to be able to detect when it needs to be enabled and tune appropriately so lets have a sensible default for the bulk of users. This patch (of 2): zone_reclaim_mode causes processes to prefer reclaiming memory from local node instead of spilling over to other nodes. This made sense initially when NUMA machines were almost exclusively HPC and the workload was partitioned into nodes. The NUMA penalties were sufficiently high to justify reclaiming the memory. On current machines and workloads it is often the case that zone_reclaim_mode destroys performance but not all users know how to detect this. Favour the common case and disable it by default. Users that are sophisticated enough to know they need zone_reclaim_mode will detect it. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cfdcd808f52..dfe954fbb48a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1860,8 +1860,6 @@ static void __paginginit init_zone_allows_reclaim(int nid) for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; } #else /* CONFIG_NUMA */ -- cgit v1.2.3 From 5f7a75acdb24c7b9c436b3a0a66eec12e101d19c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:15 -0700 Subject: mm: page_alloc: do not cache reclaim distances pgdat->reclaim_nodes tracks if a remote node is allowed to be reclaimed by zone_reclaim due to its distance. As it is expected that zone_reclaim_mode will be rarely enabled it is unreasonable for all machines to take a penalty. Fortunately, the zone_reclaim_mode() path is already slow and it is the path that takes the hit. 
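For illustration only (this mirrors the hunk below and adds nothing to the patch): with the cached pgdat->reclaim_nodes mask gone, whether a remote zone may be reclaimed from is computed on demand from the NUMA distance:

    static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
    {
            return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                    RECLAIM_DISTANCE;
    }
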
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dfe954fbb48a..9f13bcfb6762 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1850,16 +1850,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} - -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_node_state(i, N_MEMORY) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1893,9 +1885,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) -{ -} #endif /* CONFIG_NUMA */ /* @@ -4933,8 +4922,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - if (node_state(nid, N_MEMORY)) - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif -- cgit v1.2.3 From e8d9df3abac5d02dd4e6a0041cb62e69189b2c8e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:17 -0700 Subject: memcg: un-export __memcg_kmem_get_cache It is only used in slab and should not be used anywhere else so there is no need in exporting it. Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20f47d9cd8b2..c1b816f61536 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3513,7 +3513,6 @@ out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is -- cgit v1.2.3 From bfc8c90139ebd049b9801a951db3b9a4a00bed9c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:18 -0700 Subject: mem-hotplug: implement get/put_online_mems kmem_cache_{create,destroy,shrink} need to get a stable value of cpu/node online mask, because they init/destroy/access per-cpu/node kmem_cache parts, which can be allocated or destroyed on cpu/mem hotplug. To protect against cpu hotplug, these functions use {get,put}_online_cpus. However, they do nothing to synchronize with memory hotplug - taking the slab_mutex does not eliminate the possibility of race as described in patch 2. What we need there is something like get_online_cpus, but for memory. We already have lock_memory_hotplug, which serves for the purpose, but it's a bit of a hammer right now, because it's backed by a mutex. As a result, it imposes some limitations to locking order, which are not desirable, and can't be used just like get_online_cpus. That's why in patch 1 I substitute it with get/put_online_mems, which work exactly like get/put_online_cpus except they block not cpu, but memory hotplug. [ v1 can be found at https://lkml.org/lkml/2014/4/6/68. 
I NAK'ed it by myself, because it used an rw semaphore for get/put_online_mems, making them dead lock prune. ] This patch (of 2): {un}lock_memory_hotplug, which is used to synchronize against memory hotplug, is currently backed by a mutex, which makes it a bit of a hammer - threads that only want to get a stable value of online nodes mask won't be able to proceed concurrently. Also, it imposes some strong locking ordering rules on it, which narrows down the set of its usage scenarios. This patch introduces get/put_online_mems, which are the same as get/put_online_cpus, but for memory hotplug, i.e. executing a code inside a get/put_online_mems section will guarantee a stable value of online nodes, present pages, etc. lock_memory_hotplug()/unlock_memory_hotplug() are removed altogether. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tang Chen Cc: Zhang Yanfei Cc: Toshi Kani Cc: Xishi Qiu Cc: Jiang Liu Cc: Rafael J. Wysocki Cc: David Rientjes Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 4 +- mm/memory-failure.c | 8 +-- mm/memory_hotplug.c | 142 +++++++++++++++++++++++++++++++++++++--------------- mm/slub.c | 4 +- mm/vmscan.c | 2 +- 5 files changed, 108 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d2fcdfeff7f..736ade31d1dc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1300,7 +1300,7 @@ static void kmemleak_scan(void) /* * Struct page scanning for each node. */ - lock_memory_hotplug(); + get_online_mems(); for_each_online_node(i) { unsigned long start_pfn = node_start_pfn(i); unsigned long end_pfn = node_end_pfn(i); @@ -1318,7 +1318,7 @@ static void kmemleak_scan(void) scan_block(page, page + 1, NULL, 1); } } - unlock_memory_hotplug(); + put_online_mems(); /* * Scanning the task stacks (may introduce false negatives). diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9ccef39a9de2..6917f799412b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1664,11 +1664,7 @@ int soft_offline_page(struct page *page, int flags) } } - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); + get_online_mems(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1679,7 +1675,7 @@ int soft_offline_page(struct page *page, int flags) set_migratetype_isolate(page, true); ret = get_any_page(page, pfn, flags); - unlock_memory_hotplug(); + put_online_mems(); if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db29606f..2906873a1502 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -46,19 +46,84 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); -DEFINE_MUTEX(mem_hotplug_mutex); +/* The same as the cpu_hotplug lock, but for memory hotplug. */ +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing mem hotplug operation. 
+ */ + int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} mem_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), + .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "mem_hotplug.lock" }, +#endif +}; + +/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ +#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) +#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) +#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) + +void get_online_mems(void) +{ + might_sleep(); + if (mem_hotplug.active_writer == current) + return; + memhp_lock_acquire_read(); + mutex_lock(&mem_hotplug.lock); + mem_hotplug.refcount++; + mutex_unlock(&mem_hotplug.lock); + +} -void lock_memory_hotplug(void) +void put_online_mems(void) { - mutex_lock(&mem_hotplug_mutex); + if (mem_hotplug.active_writer == current) + return; + mutex_lock(&mem_hotplug.lock); + + if (WARN_ON(!mem_hotplug.refcount)) + mem_hotplug.refcount++; /* try to fix things up */ + + if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) + wake_up_process(mem_hotplug.active_writer); + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); + } -void unlock_memory_hotplug(void) +static void mem_hotplug_begin(void) { - mutex_unlock(&mem_hotplug_mutex); + mem_hotplug.active_writer = current; + + memhp_lock_acquire(); + for (;;) { + mutex_lock(&mem_hotplug.lock); + if (likely(!mem_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&mem_hotplug.lock); + schedule(); + } } +static void mem_hotplug_done(void) +{ + mem_hotplug.active_writer = NULL; + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); +} /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) @@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - lock_memory_hotplug(); + mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). 
* The section can't be removed here because of the @@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); + ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && - !can_online_high_movable(zone)) { - unlock_memory_hotplug(); - return -EINVAL; - } + !can_online_high_movable(zone)) + goto out; if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) + goto out; } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) + goto out; } /* Previous code may changed the zone of the pfn range */ @@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } /* * If this zone is not populated, then it is not in zonelist. @@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } zone->present_pages += onlined_pages; @@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); - unlock_memory_hotplug(); - - return 0; +out: + mem_hotplug_done(); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1055,7 +1117,7 @@ int try_online_node(int nid) if (node_online(nid)) return 0; - lock_memory_hotplug(); + mem_hotplug_begin(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); @@ -1073,7 +1135,7 @@ int try_online_node(int nid) } out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) new_pgdat = !p; } - lock_memory_hotplug(); + mem_hotplug_begin(); new_node = !node_online(nid); if (new_node) { @@ -1158,7 +1220,7 @@ error: release_memory_resource(res); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_memory_hotplug(); + mem_hotplug_begin(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -1672,7 +1734,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_memory_hotplug(); + mem_hotplug_done(); return 0; failed_removal: @@ -1684,7 +1746,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) BUG_ON(check_hotplug_memory_range(start, size)); - lock_memory_hotplug(); + mem_hotplug_begin(); /* * All memory blocks must be offlined before removing memory. 
Check @@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, check_memblock_offlined_cb); - if (ret) { - unlock_memory_hotplug(); + if (ret) BUG(); - } /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); - unlock_memory_hotplug(); + mem_hotplug_done(); } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/slub.c b/mm/slub.c index ddb60795f373..9cb2501a2960 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4332,7 +4332,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - lock_memory_hotplug(); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { @@ -4372,7 +4372,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - unlock_memory_hotplug(); + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7901cb749e17..fbcf46076c4f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3434,7 +3434,7 @@ int kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { -- cgit v1.2.3 From 03afc0e25f7fc03537014a770f4c54ebbe63a24c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:20 -0700 Subject: slab: get_online_mems for kmem_cache_{create,destroy,shrink} When we create a sl[au]b cache, we allocate kmem_cache_node structures for each online NUMA node. To handle nodes taken online/offline, we register memory hotplug notifier and allocate/free kmem_cache_node corresponding to the node that changes its state for each kmem cache. To synchronize between the two paths we hold the slab_mutex during both the cache creationg/destruction path and while tuning per-node parts of kmem caches in memory hotplug handler, but that's not quite right, because it does not guarantee that a newly created cache will have all kmem_cache_nodes initialized in case it races with memory hotplug. For instance, in case of slub: CPU0 CPU1 ---- ---- kmem_cache_create: online_pages: __kmem_cache_create: slab_memory_callback: slab_mem_going_online_callback: lock slab_mutex for each slab_caches list entry allocate kmem_cache node unlock slab_mutex lock slab_mutex init_kmem_cache_nodes: for_each_node_state(node, N_NORMAL_MEMORY) allocate kmem_cache node add kmem_cache to slab_caches list unlock slab_mutex online_pages (continued): node_states_set_node As a result we'll get a kmem cache with not all kmem_cache_nodes allocated. To avoid issues like that we should hold get/put_online_mems() during the whole kmem cache creation/destruction/shrink paths, just like we deal with cpu hotplug. This patch does the trick. Note, that after it's applied, there is no need in taking the slab_mutex for kmem_cache_shrink any more, so it is removed from there. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tang Chen Cc: Zhang Yanfei Cc: Toshi Kani Cc: Xishi Qiu Cc: Jiang Liu Cc: Rafael J. 
Wysocki Cc: David Rientjes Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 ++------------------------ mm/slab.h | 1 + mm/slab_common.c | 35 +++++++++++++++++++++++++++++++++-- mm/slob.c | 3 +-- mm/slub.c | 5 ++--- 5 files changed, 39 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 944ac58cfcf8..7067ea7f3927 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2480,8 +2480,7 @@ out: return nr_freed; } -/* Called with slab_mutex held to protect against cpu hotplug */ -static int __cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; struct kmem_cache_node *n; @@ -2502,32 +2501,11 @@ static int __cache_shrink(struct kmem_cache *cachep) return (ret ? 1 : 0); } -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret; - BUG_ON(!cachep || in_interrupt()); - - get_online_cpus(); - mutex_lock(&slab_mutex); - ret = __cache_shrink(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return ret; -} -EXPORT_SYMBOL(kmem_cache_shrink); - int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep); if (rc) return rc; diff --git a/mm/slab.h b/mm/slab.h index 863e67b8c8c9..d85d59803d5f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 2834bc2886fd..2dd920dc3776 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -205,6 +205,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, int err; get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); @@ -239,6 +241,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); if (err) { @@ -272,6 +276,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c char *cache_name; get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); /* @@ -295,6 +301,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); } @@ -328,6 +336,8 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); s->refcount--; @@ -359,15 +369,36 @@ void kmem_cache_destroy(struct kmem_cache *s) #else slab_kmem_cache_release(s); #endif - goto out_put_cpus; + goto out; out_unlock: mutex_unlock(&slab_mutex); -out_put_cpus: +out: + put_online_mems(); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. 
+ */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret; + + get_online_cpus(); + get_online_mems(); + ret = __kmem_cache_shrink(cachep); + put_online_mems(); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL(kmem_cache_shrink); + int slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slob.c b/mm/slob.c index 730cad45d4be..21980e0f39a8 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", diff --git a/mm/slub.c b/mm/slub.c index 9cb2501a2960..5d1b653183ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3398,7 +3398,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3454,7 +3454,6 @@ int kmem_cache_shrink(struct kmem_cache *s) kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); static int slab_mem_going_offline_callback(void *arg) { @@ -3462,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; -- cgit v1.2.3 From 5bcc9f86ef09a933255ee66bd899d4601785dad5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:07:22 -0700 Subject: mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced For the MIGRATE_RESERVE pages, it is useful when they do not get misplaced on free_list of other migratetype, otherwise they might get allocated prematurely and e.g. fragment the MIGRATE_RESEVE pageblocks. While this cannot be avoided completely when allocating new MIGRATE_RESERVE pageblocks in min_free_kbytes sysctl handler, we should prevent the misplacement where possible. Currently, it is possible for the misplacement to happen when a MIGRATE_RESERVE page is allocated on pcplist through rmqueue_bulk() as a fallback for other desired migratetype, and then later freed back through free_pcppages_bulk() without being actually used. This happens because free_pcppages_bulk() uses get_freepage_migratetype() to choose the free_list, and rmqueue_bulk() calls set_freepage_migratetype() with the *desired* migratetype and not the page's original MIGRATE_RESERVE migratetype. This patch fixes the problem by moving the call to set_freepage_migratetype() from rmqueue_bulk() down to __rmqueue_smallest() and __rmqueue_fallback() where the actual page's migratetype (e.g. from which free_list the page is taken from) is used. Note that this migratetype might be different from the pageblock's migratetype due to freepage stealing decisions. This is OK, as page stealing never uses MIGRATE_RESERVE as a fallback, and also takes care to leave all MIGRATE_CMA pages on the correct freelist. Therefore, as an additional benefit, the call to get_pageblock_migratetype() from rmqueue_bulk() when CMA is enabled, can be removed completely. This relies on the fact that MIGRATE_CMA pageblocks are created only during system init, and the above. The related is_migrate_isolate() check is also unnecessary, as memory isolation has other ways to move pages between freelists, and drain pcp lists containing pages that should be isolated. 
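A rough before/after sketch of the problematic spot (illustration only; the real hunks follow below). Previously rmqueue_bulk() tagged each page with the requested migratetype, so a MIGRATE_RESERVE page grabbed as a fallback could later be freed back to the wrong list; now the tagging happens at the point where the page actually leaves a free_list:

    /* in __rmqueue_fallback(): new_type is the free_list the page was
     * actually taken from, which may differ from start_migratetype */
    expand(zone, page, order, current_order, area, new_type);
    set_freepage_migratetype(page, new_type);
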
The buffered_rmqueue() can also benefit from calling get_freepage_migratetype() instead of get_pageblock_migratetype(). Signed-off-by: Vlastimil Babka Reported-by: Yong-Taek Lee Reported-by: Bartlomiej Zolnierkiewicz Suggested-by: Joonsoo Kim Acked-by: Joonsoo Kim Suggested-by: Mel Gorman Acked-by: Minchan Kim Cc: KOSAKI Motohiro Cc: Marek Szyprowski Cc: Hugh Dickins Cc: Rik van Riel Cc: Michal Nazarewicz Cc: "Wang, Yalin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f13bcfb6762..ab46f7945098 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -931,6 +931,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1057,7 +1058,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1125,6 +1128,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. + */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1175,7 +1184,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1196,14 +1205,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1572,7 +1575,7 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); -- cgit v1.2.3 From 13fb44e4b0414d7e718433a49e6430d5b76bd46e Mon Sep 17 00:00:00 2001 From: Heesub Shin Date: Wed, 4 Jun 2014 16:07:24 -0700 Subject: mm/compaction: clean up unused code lines Remove code lines currently not in use or never called. 
Signed-off-by: Heesub Shin Acked-by: Vlastimil Babka Cc: Dongjun Shin Cc: Sunghwan Yun Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: Dongjun Shin Cc: Sunghwan Yun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 627dc2e4320f..95f7531458f7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -208,12 +208,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) -{ - return compact_checklock_irqsave(lock, flags, false, cc); -} - /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { @@ -736,7 +730,6 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; /* * Take care when isolating in last pageblock of a zone which @@ -1165,9 +1158,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0)) compaction_defer_reset(zone, cc->order, false); - /* Currently async compaction is never deferred. */ - else if (cc->sync) - defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); -- cgit v1.2.3 From c96b9e508f3d06ddb601dcc9792d62c044ab359e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:07:26 -0700 Subject: mm/compaction: cleanup isolate_freepages() isolate_freepages() is currently somewhat hard to follow thanks to many looks like it is related to the 'low_pfn' variable, but in fact it is not. This patch renames the 'high_pfn' variable to a hopefully less confusing name, and slightly changes its handling without a functional change. A comment made obsolete by recent changes is also updated. [akpm@linux-foundation.org: comment fixes, per Minchan] [iamjoonsoo.kim@lge.com: cleanups] Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: Dongjun Shin Cc: Sunghwan Yun Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 56 +++++++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 95f7531458f7..6010aabde28c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -665,7 +665,10 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ + unsigned long next_free_pfn; /* start pfn for scaning at next round */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -673,32 +676,33 @@ static void isolate_freepages(struct zone *zone, * Initialise the free scanner. 
The starting point is where we last * successfully isolated from, zone-cached value, or the end of the * zone when isolating for the first time. We need this aligned to - * the pageblock boundary, because we do pfn -= pageblock_nr_pages - * in the for loop. + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. */ - pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. + * If no pages are isolated, the block_start_pfn < low_pfn check + * will kick in. */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); + next_free_pfn = 0; /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; - unsigned long end_pfn; /* * This can iterate a massively long zone without finding any @@ -707,7 +711,7 @@ static void isolate_freepages(struct zone *zone, */ cond_resched(); - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -717,7 +721,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -730,14 +734,8 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - - /* - * Take care when isolating in last pageblock of a zone which - * ends in the middle of a pageblock. 
- */ - end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); nr_freepages += isolated; /* @@ -745,9 +743,9 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated && next_free_pfn == 0) { cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); + next_free_pfn = block_start_pfn; } } @@ -758,10 +756,10 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + next_free_pfn = cc->migrate_pfn; + + cc->free_pfn = next_free_pfn; cc->nr_freepages = nr_freepages; } -- cgit v1.2.3 From 613813e8985bb76bd27937bfa54faf9e9be95a52 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:07:27 -0700 Subject: mm: debug: make bad_range() output more usable and readable Nobody outputs memory addresses in decimal. PFNs are essentially addresses, and they're gibberish in decimal. Output them in hex. Also, add the nid and zone name to give a little more context to the message. Signed-off-by: Dave Hansen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab46f7945098..132c337dbe55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } -- cgit v1.2.3 From 2329d3751b082b4fd354f334a88662d72abac52d Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:07:31 -0700 Subject: mm/swap.c: clean up *lru_cache_add* functions In mm/swap.c, __lru_cache_add() is exported, but actually there are no users outside this file. This patch unexports __lru_cache_add(), and makes it static. It also exports lru_cache_add_file(), as it is use by cifs and fuse, which can loaded as modules. Signed-off-by: Jianyu Zhan Cc: Minchan Kim Cc: Johannes Weiner Cc: Shaohua Li Cc: Bob Liu Cc: Seth Jennings Cc: Joonsoo Kim Cc: Rafael Aquini Cc: Mel Gorman Acked-by: Rik van Riel Cc: Andrea Arcangeli Cc: Khalid Aziz Cc: Christoph Hellwig Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 9ce43ba4498b..c0ed4d65438f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -582,13 +582,7 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Queue the page for addition to the LRU via pagevec. The decision on whether - * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. 
This gives a chance for the caller of __lru_cache_add() - * have the page added to the active list using mark_page_accessed(). - */ -void __lru_cache_add(struct page *page) +static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -598,11 +592,32 @@ void __lru_cache_add(struct page *page) pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } -EXPORT_SYMBOL(__lru_cache_add); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { -- cgit v1.2.3 From 64ac4940d557df8caab602eaea679ec7eaf9a57f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Jun 2014 16:07:33 -0700 Subject: mm/mmap.c: remove the first mapping check Remove the first mapping check for vma_link. Move the mutex_lock into the braces when vma->vm_file is true. Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 6cdec3a6f4bf..8a56d39df4ed 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { struct address_space *mapping = NULL; - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - - if (mapping) mutex_lock(&mapping->i_mmap_mutex); + } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); -- cgit v1.2.3 From 675becce15f320337499bc1a9356260409a5ba29 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:35 -0700 Subject: mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL throttle_direct_reclaim() is meant to trigger during swap-over-network during which the min watermark is treated as a pfmemalloc reserve. It throttes on the first node in the zonelist but this is flawed. The user-visible impact is that a process running on CPU whose local memory node has no ZONE_NORMAL will stall for prolonged periods of time, possibly indefintely. This is due to throttle_direct_reclaim thinking the pfmemalloc reserves are depleted when in fact they don't exist on that node. On a NUMA machine running a 32-bit kernel (I know) allocation requests from CPUs on node 1 would detect no pfmemalloc reserves and the process gets throttled. This patch adjusts throttling of direct reclaim to throttle based on the first node in the zonelist that has a usable ZONE_NORMAL or lower zone. 
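A minimal sketch of the new node selection (illustration only; the hunk follows below): instead of always using the first node of the zonelist, walk it until a zone at or below ZONE_NORMAL is found and throttle against that node's pfmemalloc reserves, or skip throttling entirely if no such zone is usable:

    for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_mask, nodemask) {
            if (zone_idx(zone) > ZONE_NORMAL)
                    continue;

            /* Throttle based on the first usable node */
            pgdat = zone->zone_pgdat;
            if (pfmemalloc_watermark_ok(pgdat))
                    goto out;
            break;
    }

    /* If no zone was usable by the allocation flags then do not throttle */
    if (!pgdat)
            goto out;
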
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fbcf46076c4f..53e4534885ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2537,10 +2537,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2565,9 +2572,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2586,10 +2593,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ -- cgit v1.2.3 From d8dc595ce3909fbc131bdf5ab8c9808fe624b18d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 4 Jun 2014 16:07:36 -0700 Subject: memcg: do not hang on OOM when killed by userspace OOM access to memory reserves Eric has reported that he can see task(s) stuck in memcg OOM handler regularly. The only way out is to echo 0 > $GROUP/memory.oom_control His usecase is: - Setup a hierarchy with memory and the freezer (disable kernel oom and have a process watch for oom). - In that memory cgroup add a process with one thread per cpu. - In one thread slowly allocate once per second I think it is 16M of ram and mlock and dirty it (just to force the pages into ram and stay there). - When oom is achieved loop: * attempt to freeze all of the tasks. 
* if frozen send every task SIGKILL, unfreeze, remove the directory in cgroupfs. Eric has then pinpointed the issue to be memcg specific. All tasks are sitting on the memcg_oom_waitq when memcg oom is disabled. Those that have received fatal signal will bypass the charge and should continue on their way out. The tricky part is that the exit path might trigger a page fault (e.g. exit_robust_list), thus the memcg charge, while its memcg is still under OOM because nobody has released any charges yet. Unlike with the in-kernel OOM handler the exiting task doesn't get TIF_MEMDIE set so it doesn't shortcut further charges of the killed task and falls to the memcg OOM again without any way out of it as there are no fatal signals pending anymore. This patch fixes the issue by checking PF_EXITING early in mem_cgroup_try_charge and bypass the charge same as if it had fatal signal pending or TIF_MEMDIE set. Normally exiting tasks (aka not killed) will bypass the charge now but this should be OK as the task is leaving and will release memory and increasing the memory pressure just to release it in a moment seems dubious wasting of cycles. Besides that charges after exit_signals should be rare. I am bringing this patch again (rebased on the current mmotm tree). I hope we can move forward finally. If there is still an opposition then I would really appreciate a concurrent approach so that we can discuss alternatives. http://comments.gmane.org/gmane.linux.kernel.stable/77650 is a reference to the followup discussion when the patch has been dropped from the mmotm last time. Reported-by: Eric W. Biederman Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1b816f61536..9f4ff49c6add 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2684,7 +2684,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, * free their memory. */ if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current))) + fatal_signal_pending(current) || + current->flags & PF_EXITING)) goto bypass; if (unlikely(task_in_memcg_oom(current))) -- cgit v1.2.3 From 1e32e77f95d60b121b6072e3e3a650a7f93068f9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:37 -0700 Subject: memcg, slab: do not schedule cache destruction when last page goes away This patchset is a part of preparations for kmemcg re-parenting. It targets at simplifying kmemcg work-flows and synchronization. First, it removes async per memcg cache destruction (see patches 1, 2). Now caches are only destroyed on memcg offline. That means the caches that are not empty on memcg offline will be leaked. However, they are already leaked, because memcg_cache_params::nr_pages normally never drops to 0 so the destruction work is never scheduled except kmem_cache_shrink is called explicitly. In the future I'm planning reaping such dead caches on vmpressure or periodically. Second, it substitutes per memcg slab_caches_mutex's with the global memcg_slab_mutex, which should be taken during the whole per memcg cache creation/destruction path before the slab_mutex (see patch 3). This greatly simplifies synchronization among various per memcg cache creation/destruction paths. 
I'm still not quite sure about the end picture, in particular I don't know whether we should reap dead memcgs' kmem caches periodically or try to merge them with their parents (see https://lkml.org/lkml/2014/4/20/38 for more details), but whichever way we choose, this set looks like a reasonable change to me, because it greatly simplifies kmemcg work-flows and eases further development. This patch (of 3): After a memcg is offlined, we mark its kmem caches that cannot be deleted right now due to pending objects as dead by setting the memcg_cache_params::dead flag, so that memcg_release_pages will schedule cache destruction (memcg_cache_params::destroy) as soon as the last slab of the cache is freed (memcg_cache_params::nr_pages drops to zero). I guess the idea was to destroy the caches as soon as possible, i.e. immediately after freeing the last object. However, it just doesn't work that way, because kmem caches always preserve some pages for the sake of performance, so that nr_pages never gets to zero unless the cache is shrunk explicitly using kmem_cache_shrink. Of course, we could account the total number of objects on the cache or check if all the slabs allocated for the cache are empty on kmem_cache_free and schedule destruction if so, but that would be too costly. Thus we have a piece of code that works only when we explicitly call kmem_cache_shrink, but complicates the whole picture a lot. Moreover, it's racy in fact. For instance, kmem_cache_shrink may free the last slab and thus schedule cache destruction before it finishes checking that the cache is empty, which can lead to use-after-free. So I propose to remove this async cache destruction from memcg_release_pages, and check if the cache is empty explicitly after calling kmem_cache_shrink instead. This will simplify things a lot w/o introducing any functional changes. And regarding dead memcg caches (i.e. those that are left hanging around after memcg offline for they have objects), I suppose we should reap them either periodically or on vmpressure as Glauber suggested initially. I'm going to implement this later. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 ++------------------------------------------------------- mm/slab.h | 7 ++----- 2 files changed, 4 insertions(+), 66 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f4ff49c6add..6b1c45ced733 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3277,60 +3277,11 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) cachep = memcg_params_to_cache(p); - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. 
- * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) kmem_cache_destroy(cachep); } -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. - */ - schedule_work(&cachep->memcg_params->destroy); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; @@ -3356,16 +3307,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) * We will now manually delete the caches, so to avoid races * we need to cancel all pending destruction workers and * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. */ - c->memcg_params->dead = false; cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); @@ -3387,7 +3329,6 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) mutex_lock(&memcg->slab_caches_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); diff --git a/mm/slab.h b/mm/slab.h index d85d59803d5f..b59447ac4533 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -129,11 +129,8 @@ static inline void memcg_bind_pages(struct kmem_cache *s, int order) static inline void memcg_release_pages(struct kmem_cache *s, int order) { - if (is_root_cache(s)) - return; - - if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) - mem_cgroup_destroy_cache(s); + if (!is_root_cache(s)) + atomic_sub(1 << order, &s->memcg_params->nr_pages); } static inline bool slab_equal_or_root(struct kmem_cache *s, -- cgit v1.2.3 From c67a8a685a6e9abbaf0235e084168f15a721ae39 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:39 -0700 Subject: memcg, slab: merge memcg_{bind,release}_pages to memcg_{un}charge_slab Currently we have two pairs of kmemcg-related functions that are called on slab alloc/free. 
The first is memcg_{bind,release}_pages that count the total number of pages allocated on a kmem cache. The second is memcg_{un}charge_slab that {un}charge slab pages to kmemcg resource counter. Let's just merge them to keep the code clean. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 ++++++++++++++++++++-- mm/slab.c | 2 -- mm/slab.h | 25 ++----------------------- mm/slub.c | 2 -- 4 files changed, 22 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b1c45ced733..86a2078805e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2954,7 +2954,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2992,7 +2992,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) @@ -3390,6 +3390,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, __memcg_create_cache_enqueue(memcg, cachep); memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. 
diff --git a/mm/slab.c b/mm/slab.c index 7067ea7f3927..9ca3b87edabc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1712,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); - memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1748,7 +1747,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page_mapcount_reset(page); page->mapping = NULL; - memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; __free_pages(page, cachep->gfporder); diff --git a/mm/slab.h b/mm/slab.h index b59447ac4533..961a3fb1f5a2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -121,18 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_add(1 << order, &s->memcg_params->nr_pages); -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_sub(1 << order, &s->memcg_params->nr_pages); -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { @@ -198,8 +186,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, - PAGE_SIZE << order); + return __memcg_charge_slab(s, gfp, order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -208,7 +195,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); + __memcg_uncharge_slab(s, order); } #else static inline bool is_root_cache(struct kmem_cache *s) @@ -216,14 +203,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return true; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { diff --git a/mm/slub.c b/mm/slub.c index 5d1b653183ab..9e288d7c5e6a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1422,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1473,7 +1472,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; -- cgit v1.2.3 From bd67314586a3d5725e60f2f6587b4cb0f659bb67 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:40 -0700 Subject: memcg, slab: simplify synchronization scheme At present, we have the following mutexes protecting data related to per memcg kmem caches: - slab_mutex. This one is held during the whole kmem cache creation and destruction paths. We also take it when updating per root cache memcg_caches arrays (see memcg_update_all_caches). 
As a result, taking it guarantees there will be no changes to any kmem cache (including per memcg). Why do we need something else then? The point is that it is private to the slab implementation and has some internal dependencies with other mutexes (get_online_cpus). So we just don't want to rely upon it and prefer to introduce additional mutexes instead. - activate_kmem_mutex. Initially it was added to synchronize initializing the kmem limit (memcg_activate_kmem). However, since we can grow per root cache memcg_caches arrays only on kmem limit initialization (see memcg_update_all_caches), we also employ it to protect against memcg_caches arrays relocation (e.g. see __kmem_cache_destroy_memcg_children). - We have a convention not to take slab_mutex in memcontrol.c, but we want to walk over per memcg memcg_slab_caches lists there (e.g. for destroying all memcg caches on offline). So we have per memcg slab_caches_mutex's protecting those lists. The mutexes are taken in the following order: activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex Such a synchronization scheme has a number of flaws, for instance: - We can't call kmem_cache_{destroy,shrink} while walking over a memcg::memcg_slab_caches list due to locking order. As a result, in mem_cgroup_destroy_all_caches we schedule the memcg_cache_params::destroy work for shrinking and destroying the cache. - We don't have a mutex to synchronize per memcg caches destruction between memcg offline (mem_cgroup_destroy_all_caches) and root cache destruction (__kmem_cache_destroy_memcg_children). Currently we just don't bother about it. This patch simplifies it by substituting per memcg slab_caches_mutex's with the global memcg_slab_mutex. It will be held whenever a new per memcg cache is created or destroyed, so it protects per root cache memcg_caches arrays and per memcg memcg_slab_caches lists. The locking order is the following: activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex This allows us to call kmem_cache_{create,shrink,destroy} under the memcg_slab_mutex. As a result, we don't need the memcg_cache_params::destroy work any more - we can simply destroy caches while iterating over a per memcg slab caches list. Also, using the global mutex simplifies synchronization between concurrent per memcg cache creation/destruction, e.g. mem_cgroup_destroy_all_caches vs __kmem_cache_destroy_memcg_children. The downside of this is that we substitute per-memcg slab_caches_mutex's with a hammer-like global mutex, but since we already take either the slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it shouldn't hurt concurrency a lot. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 150 ++++++++++++++++++++++--------------------------------- mm/slab_common.c | 23 +++------ 2 files changed, 67 insertions(+), 106 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86a2078805e5..6b448881422b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list.
per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2913,6 +2912,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2945,10 +2950,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3050,8 +3055,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3148,8 +3151,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3166,24 +3167,34 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_kmem_create_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, id)) return; + cachep = kmem_cache_create_memcg(memcg, root_cache); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. */ - lockdep_assert_held(&slab_mutex); + if (!cachep) + return; - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3192,49 +3203,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. 
- */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. - */ - VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); + + kmem_cache_destroy(cachep); } /* @@ -3268,70 +3260,42 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - kmem_cache_destroy(cachep); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. 
- */ - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_kmem_destroy_cache(c); if (cache_from_memcg_idx(s, i)) failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); return failed; } static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_kmem_destroy_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } struct create_work { @@ -3346,7 +3310,10 @@ static void memcg_create_cache_work_func(struct work_struct *w) struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - kmem_cache_create_memcg(memcg, cachep); + mutex_lock(&memcg_slab_mutex); + memcg_kmem_create_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + css_put(&memcg->css); kfree(cw); } @@ -5022,13 +4989,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. */ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the diff --git a/mm/slab_common.c b/mm/slab_common.c index 2dd920dc3776..7e348cff814d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_register_cache(s); out: if (err) return ERR_PTR(err); @@ -270,9 +269,10 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) +struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; char *cache_name; get_online_cpus(); @@ -280,14 +280,6 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c mutex_lock(&slab_mutex); - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. 
- */ - if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) - goto out_unlock; - cache_name = memcg_create_cache_name(memcg, root_cache); if (!cache_name) goto out_unlock; @@ -296,14 +288,18 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) + if (IS_ERR(s)) { kfree(cache_name); + s = NULL; + } out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); + + return s; } static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) @@ -348,11 +344,8 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; list_del(&s->list); - memcg_unregister_cache(s); - if (__kmem_cache_shutdown(s) != 0) { list_add(&s->list, &slab_caches); - memcg_register_cache(s); printk(KERN_ERR "kmem_cache_destroy %s: " "Slab cache still has objects\n", s->name); dump_stack(); -- cgit v1.2.3 From 11de9927f9dd3cb0a0f18064fa4b6976fc37e79c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:41 -0700 Subject: mm: numa: add migrated transhuge pages to LRU the same way as base pages Migration of misplaced transhuge pages uses page_add_new_anon_rmap() when putting the page back as it avoided an atomic operations and added the new page to the correct LRU. A side-effect is that the page gets marked activated as part of the migration meaning that transhuge and base pages are treated differently from an aging perspective than base page migration. This patch uses page_add_anon_rmap() and putback_lru_page() on completion of a transhuge migration similar to base page migration. It would require fewer atomic operations to use lru_cache_add without taking an additional reference to the page. The downside would be that it's still different to base page migration and unevictable pages may be added to the wrong LRU for cleaning up later. Testing of the usual workloads did not show any adverse impact to the change. Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Sasha Levin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index bed48809e5d0..6247be7fa30e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1852,7 +1852,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_new_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start); pmdp_clear_flush(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1877,6 +1877,10 @@ fail_putback: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Take an "isolate" reference and put new page on the LRU. */ + get_page(new_page); + putback_lru_page(new_page); + unlock_page(new_page); unlock_page(page); put_page(page); /* Drop the rmap reference */ -- cgit v1.2.3 From 399ba0b95670c70aaaa3f4f1623ea9e76c391681 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 4 Jun 2014 16:07:42 -0700 Subject: mm/vmscan.c: avoid throttling reclaim for loop-back nfsd threads When a loopback NFS mount is active and the backing device for the NFS mount becomes congested, that can impose throttling delays on the nfsd threads. These delays significantly reduce throughput and so the NFS mount remains congested. 
This results in a livelock and the reduced throughput persists. This livelock has been found in testing with the 'wait_iff_congested' call, and could possibly be caused by the 'congestion_wait' call. This livelock is similar to the deadlock which justified the introduction of PF_LESS_THROTTLE, and the same flag can be used to remove this livelock. To minimise the impact of the change, we still throttle nfsd when the filesystem it is writing to is congested, but not when some separate filesystem (e.g. the NFS filesystem) is congested. Signed-off-by: NeilBrown Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 53e4534885ad..5a8776eb0f43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1438,6 +1438,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) list_splice(&pages_to_free, page_list); } +/* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages @@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * implies that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) + if ((nr_unqueued_dirty == nr_taken || nr_immediate) && + current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd()) + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, -- cgit v1.2.3 From 57d998456ae8680ed446aa1993f45f4d8a9a5973 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:45 -0700 Subject: fs/mpage.c: factor page_endio() out of mpage_end_io() page_endio() takes care of updating all the appropriate page flags once I/O has finished to a page. Switch to using mapping_set_error() instead of setting AS_EIO directly; this will handle thin-provisioned devices correctly. 
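As an illustration of the intended use (a sketch under assumed names, not code from this patch): a driver or filesystem that issues one-page bios could reduce its completion handler to a single page_endio() call. The sketch assumes the 3.15-era bi_end_io signature and a bio known to carry exactly one page:

static void example_one_page_end_io(struct bio *bio, int err)
{
	/* the bio is assumed to carry exactly one page */
	struct page *page = bio->bi_io_vec[0].bv_page;

	/* page_endio() performs the READ/WRITE page flag updates in one place */
	page_endio(page, bio_data_dir(bio), err);
	bio_put(bio);
}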
Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 021056c324e6..47d235b357a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -764,6 +764,31 @@ void end_page_writeback(struct page *page) } EXPORT_SYMBOL(end_page_writeback); +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock -- cgit v1.2.3 From dd6bd0d9c7dbb3951005db4e424abbd37c4ff801 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:48 -0700 Subject: swap: use bdev_read_page() / bdev_write_page() By calling the device driver to write the page directly, we avoid allocating a BIO, which allows us to free memory without allocating memory. [akpm@linux-foundation.org: fix used-uninitialized bug] Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef681381..58b50d2901fe 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -248,11 +248,16 @@ out: return ret; } +static sector_t swap_page_sector(struct page *page) +{ + return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)) { struct bio *bio; - int ret = 0, rw = WRITE; + int ret, rw = WRITE; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { @@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return ret; } + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_vm_event(PSWPOUT); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); @@ -338,6 +350,13 @@ int swap_readpage(struct page *page) return ret; } + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); -- cgit v1.2.3 From c8e861a531b0199dc6ef9e402e29c474dfa507ce Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:07:51 -0700 Subject: mm/memory_hotplug.c: use PFN_DOWN() Replace ((x) >> PAGE_SHIFT) with the pfn macro. 
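For reference, the macro family is defined in include/linux/pfn.h; the two relevant definitions are essentially:

#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

so the conversion keeps the exact rounding-down arithmetic while spelling out the byte-address-to-pfn intent.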
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2906873a1502..cbb7ca0ac44b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1069,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) struct pglist_data *pgdat; unsigned long zones_size[MAX_NR_ZONES] = {0}; unsigned long zholes_size[MAX_NR_ZONES] = {0}; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -1141,7 +1141,7 @@ out: static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = start >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); u64 nr_pages = size >> PAGE_SHIFT; /* Memory range must be aligned with section */ -- cgit v1.2.3 From f7e2f7e896d8b74e92b687f7333721fd7be0f4b5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:07:51 -0700 Subject: mm/memblock.c: use PFN_DOWN Replace ((x) >> PAGE_SHIFT) with the pfn macro. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 146736411318..0aa0d2b07624 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1402,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, if (mid == -1) return -1; - *start_pfn = type->regions[mid].base >> PAGE_SHIFT; - *end_pfn = (type->regions[mid].base + type->regions[mid].size) - >> PAGE_SHIFT; + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); return type->regions[mid].nid; } -- cgit v1.2.3 From 7c8e0181e6e0b8079c4c2ce902bf52d7a2c6fa5d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 4 Jun 2014 16:07:56 -0700 Subject: mm: replace __get_cpu_var uses with this_cpu_ptr Replace places where __get_cpu_var() is used for an address calculation with this_cpu_ptr(). 
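The shape of the conversion, shown on a hypothetical per-cpu variable rather than on any one hunk below:

struct my_stat { unsigned long count; };
static DEFINE_PER_CPU(struct my_stat, my_stat);	/* hypothetical example variable */

static void bump_my_stat(void)
{
	struct my_stat *p;

	p = &__get_cpu_var(my_stat);	/* old: per-cpu lvalue, then address-of */
	p = this_cpu_ptr(&my_stat);	/* new: this CPU's pointer, computed directly */
	p->count++;
}

Either form is only meaningful while the caller cannot migrate to another CPU (preemption disabled, IRQs off, or similar), which the converted call sites arrange in their own ways.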
Signed-off-by: Christoph Lameter Cc: Tejun Heo Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/memory-failure.c | 2 +- mm/page-writeback.c | 4 ++-- mm/slub.c | 6 +++--- mm/swap.c | 2 +- mm/vmalloc.c | 2 +- mm/vmstat.c | 4 ++-- mm/zsmalloc.c | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b448881422b..14326935800d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2436,7 +2436,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6917f799412b..d50f17fb9be2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1298,7 +1298,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = &__get_cpu_var(memory_failure_cpu); + mf_cpu = this_cpu_ptr(&memory_failure_cpu); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a4317da60532..b9b8e8204628 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1623,7 +1623,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { @@ -1635,7 +1635,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. */ - p = &__get_cpu_var(dirty_throttle_leaks); + p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); diff --git a/mm/slub.c b/mm/slub.c index 9e288d7c5e6a..fdf0fe4da9a9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2209,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, page = new_slab(s, flags, node); if (page) { - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2425,7 +2425,7 @@ redo: * and the retrieval of the tid. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); /* * The transaction ids are globally unique per cpu and per operation on @@ -2681,7 +2681,7 @@ redo: * during the cmpxchg then the free will succedd. 
*/ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); tid = c->tid; preempt_enable(); diff --git a/mm/swap.c b/mm/swap.c index c0ed4d65438f..913b99dfbea5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -441,7 +441,7 @@ void rotate_reclaimable_page(struct page *page) page_cache_get(page); local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); local_irq_restore(flags); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf233b283319..ddaf70b21b59 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1496,7 +1496,7 @@ void vfree(const void *addr) if (!addr) return; if (unlikely(in_interrupt())) { - struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } else diff --git a/mm/vmstat.c b/mm/vmstat.c index 82ce17ce58c4..376bd2d21482 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -489,7 +489,7 @@ static void refresh_cpu_vm_stats(void) continue; if (__this_cpu_read(p->pcp.count)) - drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -1230,7 +1230,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(); - schedule_delayed_work(&__get_cpu_var(vmstat_work), + schedule_delayed_work(this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 36b4591a7a2d..5ae5d85b629d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) class = &pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { -- cgit v1.2.3 From f7f28ca98b9a7a99fc55df2dddcf49857ab004f0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 4 Jun 2014 16:07:57 -0700 Subject: mm: constify nmask argument to mbind() The nmask argument to mbind() is const according to the userspace header numaif.h, and since the kernel does indeed not modify it, it might as well be declared const in the kernel. Signed-off-by: Rasmus Villemoes Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472933ea..727187f1155b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, + unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; -- cgit v1.2.3 From 23c8902d403ef9a04cdc367d0b76a3ed6d83f5c5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 4 Jun 2014 16:07:58 -0700 Subject: mm: constify nmask argument to set_mempolicy() The nmask argument to set_mempolicy() is const according to the user-space header numaif.h, and since the kernel does indeed not modify it, it might as well be declared const in the kernel. 
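For comparison, the userspace declarations in numaif.h read roughly as follows (quoted from the libnuma header by memory, so treat the exact spelling as illustrative):

long set_mempolicy(int mode, const unsigned long *nmask, unsigned long maxnode);
long mbind(void *start, unsigned long len, int mode, const unsigned long *nmask,
           unsigned long maxnode, unsigned flags);

With this patch and the previous one, the kernel-side SYSCALL_DEFINE prototypes match the const-ness that the userspace headers have been promising.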
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 727187f1155b..b09586d8316b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { int err; -- cgit v1.2.3 From c747ce7907ab11be53d65ef55c53821558720d8f Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:07:59 -0700 Subject: mm/swap.c: introduce put_[un]refcounted_compound_page helpers for splitting put_compound_page() Currently, put_compound_page() carefully handles tricky cases to avoid racing with compound page releasing or splitting, which makes it quite lengthy (about 200+ lines) and needs deep tab indentation, which makes it quite hard to follow and maintain. This patch and the next patch refactor this function. Based on the code skeleton of put_compound_page: put_compound_page: if !PageTail(page) put head page fastpath; return; /* else PageTail */ page_head = compound_head(page) if !__compound_tail_refcounted(page_head) put head page optimal path; <---(1) return; else put head page slowpath; <--- (2) return; This patch introduces two helpers, put_[un]refcounted_compound_page, handling the code path (1) and code path (2), respectively. They are both tagged __always_inline, thus eliminating function call overhead and making them operate the same way as before. They are almost copied verbatim (except in one place, where a "goto out_put_single" is expanded), with some comments rephrased. Signed-off-by: Jianyu Zhan Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 913b99dfbea5..54f3ae4aaf41 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,6 +79,148 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } +/** + * Two special cases here: we could avoid taking compound_lock_irqsave + * and could skip the tail refcounting(in _mapcount). + * + * 1. Hugetlbfs page: + * + * PageHeadHuge will remain true until the compound page + * is released and enters the buddy allocator, and it could + * not be split by __split_huge_page_refcount(). + * + * So if we see PageHeadHuge set, and we have the tail page pin, + * then we could safely put head page. + * + * 2. Slab THP page: + * + * PG_slab is cleared before the slab frees the head page, and + * tail pin cannot be the last reference left on the head page, + * because the slab code is free to reuse the compound page + * after a kfree/kmem_cache_free without having to check if + * there's any tail pin left. In turn all tail pins must be always + * released while the head is still pinned by the slab code + * and so we know PG_slab will be still set too. + * + * So if we see PageSlab set, and we have the tail page pin, + * then we could safely put head page.
+ */ +static __always_inline +void put_unrefcounted_compound_page(struct page *page_head, struct page *page) +{ + /* + * If @page is a THP tail, we must read the tail page + * flags after the head page flags. The + * __split_huge_page_refcount side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount cannot race + * here, see the comment above this function. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab THP page, + * the tail pin must not be the last reference + * held on the page, because the PG_slab cannot + * be cleared before all tail pins (which skips + * the _mapcount tail refcounting) have been + * released. + * + * If this is the tail of a hugetlbfs page, + * the tail pin may be the last reference on + * the page instead, because PageHeadHuge will + * not go away until the compound page enters + * the buddy allocator. + */ + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); + } + } else + /* + * __split_huge_page_refcount run before us, + * @page was a THP tail. The split @page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). + */ + if (put_page_testzero(page)) + __put_single_page(page); +} + +static __always_inline +void put_refcounted_compound_page(struct page *page_head, struct page *page) +{ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + + /* + * @page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + /* + * The @page_head may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON_PAGE(page_head != page->first_page, page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. 
+ */ + if (put_page_testzero(page_head)) + VM_BUG_ON_PAGE(1, page_head); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); + atomic_dec(&page->_mapcount); + VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); + VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* @page_head is a dangling pointer */ + VM_BUG_ON_PAGE(PageTail(page), page); + goto out_put_single; + } +} + static void put_compound_page(struct page *page) { struct page *page_head; -- cgit v1.2.3 From 4bd3e8f7b94785a6f65665fee21ff3dbc2bf4ef8 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:08:01 -0700 Subject: mm/swap.c: split put_compound_page() Currently, put_compound_page() carefully handles tricky cases to avoid racing with compound page releasing or splitting, which makes it quite lengthy (about 200+ lines) and needs deep tab indentation, which makes it quite hard to follow and maintain. Now, based on the two helpers introduced in the previous patch ("mm/swap.c: introduce put_[un]refcounted_compound_page helpers for splitting put_compound_page"), this patch replaces those two lengthy code paths with these two helpers, respectively. Also, it has some comment rephrasing. After this patch, put_compound_page() is very compact, and thus easy to read and maintain. After splitting, the object file is of the same size as the original one. Actually, I've diff'ed put_compound_page()'s original disassembled code and the patched disassembled code, and they are 100% the same! This fact shows that this splitting has no functional change, but it improves readability. This patch and the previous one grow the code by 32 lines, mostly due to comments. Signed-off-by: Jianyu Zhan Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 142 +++++++------------------------------------------------------- 1 file changed, 16 insertions(+), 126 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 54f3ae4aaf41..d089c5a0cf98 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -225,6 +225,11 @@ static void put_compound_page(struct page *page) { struct page *page_head; + /* + * We see the PageCompound set and PageTail not set, so @page maybe: + * 1. hugetlbfs head page, or + * 2. THP head page. + */ if (likely(!PageTail(page))) { if (put_page_testzero(page)) { /* @@ -239,135 +244,20 @@ static void put_compound_page(struct page *page) return; } - /* __split_huge_page_refcount can run under us */ - page_head = compound_head(page); - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting (in - * _mapcount) too. Slab performs non-atomic bit ops on - * page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is still - * hot on arches that do not support - * this_cpu_cmpxchg_double(). + * We see the PageCompound set and PageTail set, so @page maybe: + * 1. a tail hugetlbfs page, or + * 2. a tail THP page, or + * 3. a split THP page. * - * If "page" is part of a slab or hugetlbfs page it cannot be - * splitted and the head page cannot change from under us.
And - * if "page" is part of a THP page under splitting, if the - * head page pointed by the THP tail isn't a THP head anymore, - * we'll find PageTail clear after smp_rmb() and we'll treat - * it as a single page. + * Case 3 is possible, as we may race with + * __split_huge_page_refcount tearing down a THP page. */ - if (!__compound_tail_refcounted(page_head)) { - /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. The - * split_huge_page side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab - * compound page, the tail pin must - * not be the last reference held on - * the page, because the PG_slab - * cannot be cleared before all tail - * pins (which skips the _mapcount - * tail refcounting) have been - * released. For hugetlbfs the tail - * pin may be the last reference on - * the page instead, because - * PageHeadHuge will not go away until - * the compound page enters the buddy - * allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - return; - } else - /* - * __split_huge_page_refcount run before us, - * "page" was a THP tail. The split page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - goto out_put_single; - } - - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The head page may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != page->first_page, page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. 
- */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } + page_head = compound_head(page); + if (!__compound_tail_refcounted(page_head)) + put_unrefcounted_compound_page(page_head, page); + else + put_refcounted_compound_page(page_head, page); } void put_page(struct page *page) -- cgit v1.2.3 From d2ee40eae98d8a41ff27dcdd13b1b656c4c1ad00 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:08:02 -0700 Subject: mm: introduce compound_head_by_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, in put_compound_page(), we have ====== if (likely(!PageTail(page))) { <------ (1) if (put_page_testzero(page)) { /* * By the time all refcounts have been released * split_huge_page cannot run anymore from under us. */ if (PageHead(page)) __put_compound_page(page); else __put_single_page(page); } return; } /* __split_huge_page_refcount can run under us */ page_head = compound_head(page); <------------ (2) ====== If at (1) we fail the check, this means page is *likely* a tail page. Then at (2), as compound_head(page) is inlined, it is: ====== static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) { <----------- (3) struct page *head = page->first_page; smp_rmb(); if (likely(PageTail(page))) return head; } return page; } ====== Here, the unlikely at (3) is a negative hint, because in this case the page is *likely* a tail page. So the check at (3) is not good for this case, so I introduce a helper for it. So this patch introduces compound_head_by_tail(), which deals with a possible tail page (though it could be split by a racy thread), and makes compound_head() a wrapper around it. This patch has no functional change, and it reduces the object size slightly: text data bss dec hex filename 11003 1328 16 12347 303b mm/swap.o.orig 10971 1328 16 12315 301b mm/swap.o.patched I've run "perf top -e branch-miss" to observe branch misses in this case. As Michael points out, it's a slow path, so this case happens only very rarely. But I grep'ed the code base and found there are still some other call sites that could benefit from this helper. And given that it grows the source by only 5 lines while reducing the object size, I still believe this helper deserves to exist. Signed-off-by: Jianyu Zhan Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index d089c5a0cf98..c8d6df556ce6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -253,7 +253,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page.
*/ - page_head = compound_head(page); + page_head = compound_head_by_tail(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else -- cgit v1.2.3 From cc6b664aa26de93d9a3f99d4021a8d88b434ed06 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Wed, 4 Jun 2014 16:08:05 -0700 Subject: mm/dmapool.c: remove redundant NULL check for dev in dma_pool_create() "dev" cannot be NULL because it is already checked before calling dma_pool_create(). If dev ever was NULL, the code would oops in dev_to_node() after enabling CONFIG_NUMA. It is possible that some driver is using dev==NULL and has never been run on a NUMA machine. Such a driver is probably outdated, possibly buggy and will need some attention if it starts triggering NULL derefs. Signed-off-by: Daeseok Youn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 8058fcd7ae91..a3a1bfe91110 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->boundary = boundary; retval->allocation = allocation; - if (dev) { - int ret; + INIT_LIST_HEAD(&retval->pools); - mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - ret = device_create_file(dev, &dev_attr_pools); - else - ret = 0; - /* note: not currently insisting "name" be unique */ - if (!ret) - list_add(&retval->pools, &dev->dma_pools); - else { - kfree(retval); - retval = NULL; - } - mutex_unlock(&pools_lock); + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools) && + device_create_file(dev, &dev_attr_pools)) { + kfree(retval); + return NULL; } else - INIT_LIST_HEAD(&retval->pools); + list_add(&retval->pools, &dev->dma_pools); + mutex_unlock(&pools_lock); return retval; } -- cgit v1.2.3 From 7fe7047597cf5ebb300802494db4f407327ec94f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:06 -0700 Subject: mm: shrinker trace points: fix negatives I was looking at a trace of the slab shrinkers (attachment in this comment): https://bugs.freedesktop.org/show_bug.cgi?id=72742#c67 and noticed that "total_scan" can go negative in some cases. We used to dump out the "total_scan" variable directly, but some of the shrinker modifications along the way changed that. This patch just dumps it out directly, again. It doesn't make any sense to derive it from new_nr and nr any more since there are now other shrinkers that can be running in parallel and mucking with those values. 
Here's an example of the negative numbers in the output: > kswapd0-840 [000] 160.869398: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 10 new scan count 39 total_scan 29 last shrinker return val 256 > kswapd0-840 [000] 160.869618: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 39 new scan count 102 total_scan 63 last shrinker return val 256 > kswapd0-840 [000] 160.870031: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 102 new scan count 47 total_scan -55 last shrinker return val 768 > kswapd0-840 [000] 160.870464: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 47 new scan count 45 total_scan -2 last shrinker return val 768 > kswapd0-840 [000] 163.384144: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 45 new scan count 56 total_scan 11 last shrinker return val 0 > kswapd0-840 [000] 163.384297: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 56 new scan count 15 total_scan -41 last shrinker return val 256 > kswapd0-840 [000] 163.384414: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 15 new scan count 117 total_scan 102 last shrinker return val 0 > kswapd0-840 [000] 163.384657: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 117 new scan count 36 total_scan -81 last shrinker return val 512 > kswapd0-840 [000] 163.384880: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 36 new scan count 111 total_scan 75 last shrinker return val 256 > kswapd0-840 [000] 163.385256: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 111 new scan count 34 total_scan -77 last shrinker return val 768 > kswapd0-840 [000] 163.385598: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 34 new scan count 122 total_scan 88 last shrinker return val 512 Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a8776eb0f43..15e93158bd0b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); return freed; } -- cgit v1.2.3 From df9024a8c5a3e031c5df26386f74ffed1b8fc095 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:07 -0700 Subject: mm: shrinker: add nid to tracepoint output Now that we are doing NUMA-aware shrinking, and can have shrinkers running in parallel, or working on individual nodes, it seems like we should also be sticking the node in the output. 
Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 15e93158bd0b..9253e188000f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } -- cgit v1.2.3 From ada4ba591472f511ad56dd0075c457295c3ca317 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:08 -0700 Subject: mm/memcontrol.c: remove NULL assignment on static static values are automatically initialized to NULL Signed-off-by: Fabian Frederick Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 14326935800d..03d76628fd9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -3110,7 +3110,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) char *memcg_create_cache_name(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char *buf = NULL; + static char *buf; /* * We need a mutex here to protect the shared buffer. Since this is -- cgit v1.2.3 From f4527c90868d8fa175c68ccf216cf9b67a7d8a1a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:09 -0700 Subject: mm/vmalloc.c: replace seq_printf by seq_puts Replace seq_printf where possible Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ddaf70b21b59..2ed75fb89fc1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2619,19 +2619,19 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) - seq_printf(m, " ioremap"); + seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) - seq_printf(m, " vmalloc"); + seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) - seq_printf(m, " vmap"); + seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) - seq_printf(m, " user"); + seq_puts(m, " user"); if (v->flags & VM_VPAGES) - seq_printf(m, " vpages"); + seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); -- cgit v1.2.3 From 4bbd4c776a63a063546552de42f6a535395f6d9e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:10 -0700 Subject: mm: move get_user_pages()-related code to separate file mm/memory.c is overloaded: over 4k lines. get_user_pages() code is pretty much self-contained let's move it to separate file. No other changes made. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 2 +- mm/gup.c | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/internal.h | 5 + mm/memory.c | 641 --------------------------------------------------------- 4 files changed, 655 insertions(+), 642 deletions(-) create mode 100644 mm/gup.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index 0173940407f6..4064f3ec145e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 000000000000..ea88b65f264d --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,649 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + goto out; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto no_page_table; + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. 
+ */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } + goto out; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + goto split_fallthrough; + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + goto out; + } + } else + spin_unlock(ptl); + /* fall through */ + } +split_fallthrough: + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } + + if (flags & FOLL_GET) + get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). + */ + mark_page_accessed(page); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. + */ + mlock_vma_page(page); + unlock_page(page); + } + } +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return page; + +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. 
+ */ + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return page; +} + +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. 
+ */ +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + long i; + unsigned long vm_flags; + unsigned int page_mask; + + if (!nr_pages) + return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + i = 0; + + do { + struct vm_area_struct *vma; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + unsigned long pg = start & PAGE_MASK; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + goto efault; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if (pmd_none(*pmd)) + goto efault; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + goto efault; + } + vma = get_gate_vma(mm); + if (pages) { + struct page *page; + + page = vm_normal_page(vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + goto efault; + } + } + pages[i] = page; + get_page(page); + } + pte_unmap(pte); + page_mask = 0; + goto next_page; + } + + if (!vma) + goto efault; + vm_flags = vma->vm_flags; + if (vm_flags & (VM_IO | VM_PFNMAP)) + goto efault; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * We used to let the write,force case do COW + * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so + * ptrace could set a breakpoint in a read-only + * mapping of an executable, without corrupting + * the file (yet only when that file had been + * opened for writing!). Anon pages in shared + * mappings are surprising: now just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + goto efault; + } + } + } else { + if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * Is there actually any vma we can reach here + * which does not have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + goto efault; + } + } + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, gup_flags); + continue; + } + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + + cond_resched(); + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { + int ret; + unsigned int fault_flags = 0; + + /* For mlock, just skip the stack guard page. 
*/ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); + + ret = handle_mm_fault(mm, vma, start, + fault_flags); + + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) + goto efault; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return i; + } + + /* + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) + foll_flags &= ~FOLL_WRITE; + + cond_resched(); + } + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); + return i; +efault: + return i ? : -EFAULT; +} +EXPORT_SYMBOL(__get_user_pages); + +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. 
+ */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + vm_flags_t vm_flags; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. 
+ */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) +{ + int flags = FOLL_TOUCH; + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ diff --git a/mm/internal.h b/mm/internal.h index 07b67361a40a..6ee580d69ddd 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -169,6 +169,11 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); diff --git a/mm/memory.c b/mm/memory.c index 0897830011f3..7049d394fa07 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -1458,642 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte) || pte_file(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto split_fallthrough; - } - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); - } - - if (flags & FOLL_GET) - get_page_foll(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - /* - * pte_mkyoung() would be more correct here, but atomic care - * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). - */ - mark_page_accessed(page); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. 
- * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. - */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; -} - -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - -/** - * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * __get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * __get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. 
If - * the page is written to, set_page_dirty (or set_page_dirty_lock, as - * appropriate) must be called after the page is finished with, and - * before put_page is called. - * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. - * - * In most cases, get_user_pages or get_user_pages_fast should be used - * instead of __get_user_pages. __get_user_pages should be used only if - * you need some special @gup_flags. - */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - long i; - unsigned long vm_flags; - unsigned int page_mask; - - if (!nr_pages) - return 0; - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - - i = 0; - - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - goto efault; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) - goto efault; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - goto efault; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - goto efault; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); - page_mask = 0; - goto next_page; - } - - if (!vma) - goto efault; - vm_flags = vma->vm_flags; - if (vm_flags & (VM_IO | VM_PFNMAP)) - goto efault; - - if (gup_flags & FOLL_WRITE) { - if (!(vm_flags & VM_WRITE)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * We used to let the write,force case do COW - * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so - * ptrace could set a breakpoint in a read-only - * mapping of an executable, without corrupting - * the file (yet only when that file had been - * opened for writing!). Anon pages in shared - * mappings are surprising: now just reject it. - */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); - goto efault; - } - } - } else { - if (!(vm_flags & VM_READ)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * Is there actually any vma we can reach here - * which does not have VM_MAYREAD set? - */ - if (!(vm_flags & VM_MAYREAD)) - goto efault; - } - } - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; - } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? 
i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - goto efault; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; - return i; - } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } -next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); - } while (nr_pages); - return i; -efault: - return i ? : -EFAULT; -} -EXPORT_SYMBOL(__get_user_pages); - -/* - * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * - * This is meant to be called in the specific scenario where for locking reasons - * we try to access user memory in atomic context (within a pagefault_disable() - * section), this returns -EFAULT, and we want to resolve the user fault before - * trying again. - * - * Typically this is meant to be used by the futex code. - * - * The main difference with get_user_pages() is that this function will - * unconditionally call handle_mm_fault() which will in turn perform all the - * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. - * - * This is important for some architectures where those bits also gate the - * access permission to the page because they are maintained in software. On - * such architectures, gup() will not be enough to make a subsequent access - * succeed. - * - * This should be called with the mm_sem held for read. 
- */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) -{ - struct vm_area_struct *vma; - vm_flags_t vm_flags; - int ret; - - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - return -EFAULT; - - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) - return -EFAULT; - - ret = handle_mm_fault(mm, vma, address, fault_flags); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) - return -EFAULT; - BUG(); - } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - return 0; -} - -/* - * get_user_pages() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. 
- */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) -{ - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ - pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { -- cgit v1.2.3 From f2b495ca82e188fd2818479a551f126edf023756 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:11 -0700 Subject: mm: extract in_gate_area() case from __get_user_pages() The case is special and disturb from reading main __get_user_pages() code path. Let's move it to separate function. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 90 ++++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 48 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index ea88b65f264d..0bf127b332e7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -213,6 +213,50 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add stack_guard_page_end(vma, addr+PAGE_SIZE); } +static int get_gate_page(struct mm_struct *mm, unsigned long address, + unsigned int gup_flags, struct vm_area_struct **vma, + struct page **page) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = -EFAULT; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + return -EFAULT; + if (address > TASK_SIZE) + pgd = pgd_offset_k(address); + else + pgd = pgd_offset_gate(mm, address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, address); + if (pte_none(*pte)) + goto unmap; + *vma = get_gate_vma(mm); + if (!page) + goto out; + *page = vm_normal_page(*vma, address, *pte); + if (!*page) { + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + goto unmap; + *page = pte_page(*pte); + } + get_page(*page); +out: + ret = 0; +unmap: + pte_unmap(pte); + return ret; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -291,49 +335,11 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - goto efault; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) + int ret; + ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, + &vma, pages ? &pages[i] : NULL); + if (ret) goto efault; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - goto efault; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - goto efault; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); page_mask = 0; goto next_page; } -- cgit v1.2.3 From 69e68b4f03135da4a09d1215a3942d7dabd1075b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:11 -0700 Subject: mm: cleanup follow_page_mask() Cleanups: - move pte-related code to separate function. It's about half of the function; - get rid of some goto-logic; - use 'return NULL' instead of 'return page' where page can only be NULL; Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 231 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 119 insertions(+), 112 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 0bf127b332e7..406367845ded 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -12,105 +12,35 @@ #include "internal.h" -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). - */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *no_page_table(struct vm_area_struct *vma, + unsigned int flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return NULL; +} - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; +static struct page *follow_page_pte(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + spinlock_t *ptl; + pte_t *ptep, pte; - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. 
- */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: +retry: if (unlikely(pmd_bad(*pmd))) - goto no_page_table; + return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; @@ -128,12 +58,14 @@ split_fallthrough: goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); - goto split_fallthrough; + goto retry; } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) { + pte_unmap_unlock(ptep, ptl); + return NULL; + } page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { @@ -178,11 +110,8 @@ split_fallthrough: unlock_page(page); } } -unlock: pte_unmap_unlock(ptep, ptl); -out: return page; - bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT); @@ -190,21 +119,99 @@ bad_page: no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) + return NULL; + return no_page_table(vma, flags); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); return page; + } -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. 
- */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + return NULL; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else + page = NULL; + } + return page; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + return no_page_table(vma, flags); + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + return follow_page_pte(vma, address, pmd, flags); + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; + } + } else + spin_unlock(ptl); + } + return follow_page_pte(vma, address, pmd, flags); } static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -- cgit v1.2.3 From 1674448345cdb56e724483a2a26622771f4e3a10 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:12 -0700 Subject: mm: extract code to fault in a page from __get_user_pages() Nesting level in __get_user_pages() is just insane. Let's try to fix it a bit. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 138 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 71 insertions(+), 67 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 406367845ded..28e370068ffe 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -214,12 +214,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return follow_page_pte(vma, address, pmd, flags); } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -264,6 +258,63 @@ unmap: return ret; } +static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned int fault_flags = 0; + int ret; + + /* For mlock, just skip the stack guard page. 
*/ + if ((*flags & FOLL_MLOCK) && + (stack_guard_page_start(vma, address) || + stack_guard_page_end(vma, address + PAGE_SIZE))) + return -ENOENT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return -EBUSY; + } + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when + * necessary, even if maybe_mkwrite decided not to set pte_write. We + * can thus safely do subsequent page lookups as if they were reads. + * But only do so when looping for pte_write is futile: in some cases + * userspace may also be wanting to write to the gotten user page, + * which a read fault here might prevent (a readonly page might get + * reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) + *flags &= ~FOLL_WRITE; + return 0; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -410,69 +461,22 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, while (!(page = follow_page_mask(vma, start, foll_flags, &page_mask))) { int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - goto efault; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + break; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: return i; + case -ENOENT: + goto next_page; + default: + BUG(); } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). 
- */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - cond_resched(); } if (IS_ERR(page)) -- cgit v1.2.3 From fa5bb2093a1d2ba552309a81139e0abebf5325d8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:13 -0700 Subject: mm: cleanup __get_user_pages() Get rid of two nested loops over nr_pages, extract vma flags checking to separate function and other random cleanups. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 218 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 107 insertions(+), 111 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 28e370068ffe..cc5a9e7adea7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -315,6 +315,44 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return 0; } +static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) +{ + vm_flags_t vm_flags = vma->vm_flags; + + if (vm_flags & (VM_IO | VM_PFNMAP)) + return -EFAULT; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * We used to let the write,force case do COW in a + * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could + * set a breakpoint in a read-only mapping of an + * executable, without corrupting the file (yet only + * when that file had been opened for writing!). + * Anon pages in shared mappings are surprising: now + * just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + return -EFAULT; + } + } + } else if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * Is there actually any vma we can reach here which does not + * have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + return -EFAULT; + } + return 0; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -369,9 +407,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i; - unsigned long vm_flags; + long i = 0; unsigned int page_mask; + struct vm_area_struct *vma = NULL; if (!nr_pages) return 0; @@ -386,124 +424,82 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; - i = 0; - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - int ret; - ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, - &vma, pages ? &pages[i] : NULL); - if (ret) - goto efault; - page_mask = 0; - goto next_page; - } + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + int ret; + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, + pages ? &pages[i] : NULL); + if (ret) + return i ? 
: ret; + page_mask = 0; + goto next_page; + } - if (!vma) - goto efault; - vm_flags = vma->vm_flags; - if (vm_flags & (VM_IO | VM_PFNMAP)) - goto efault; - - if (gup_flags & FOLL_WRITE) { - if (!(vm_flags & VM_WRITE)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * We used to let the write,force case do COW - * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so - * ptrace could set a breakpoint in a read-only - * mapping of an executable, without corrupting - * the file (yet only when that file had been - * opened for writing!). Anon pages in shared - * mappings are surprising: now just reject it. - */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); - goto efault; - } + if (!vma || check_vma_flags(vma, gup_flags)) + return i ? : -EFAULT; + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, + gup_flags); + continue; } - } else { - if (!(vm_flags & VM_READ)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * Is there actually any vma we can reach here - * which does not have VM_MAYREAD set? - */ - if (!(vm_flags & VM_MAYREAD)) - goto efault; + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + cond_resched(); + page = follow_page_mask(vma, start, foll_flags, &page_mask); + if (!page) { + int ret; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + goto retry; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: + return i; + case -ENOENT: + goto next_page; } + BUG(); } - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - ret = faultin_page(tsk, vma, start, &foll_flags, - nonblocking); - switch (ret) { - case 0: - break; - case -EFAULT: - case -ENOMEM: - case -EHWPOISON: - return i ? i : ret; - case -EBUSY: - return i; - case -ENOENT: - goto next_page; - default: - BUG(); - } - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; } while (nr_pages); return i; -efault: - return i ? 
: -EFAULT; } EXPORT_SYMBOL(__get_user_pages); -- cgit v1.2.3 From ac7695012a6f3269acd80d6c2b2218a6769edbf3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:17 -0700 Subject: mm/rmap.c: make page_referenced_one() and try_to_unmap_one() static KSM was converted to use rmap_walk() and now nobody uses these functions outside mm/rmap.c. Let's covert them back to static. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 7da400d5d98e..8754e1fa83b6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -671,7 +671,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1114,7 +1114,7 @@ out: /* * @arg: enum ttu_flags will be passed to this argument */ -int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; -- cgit v1.2.3 From b46e14acb816038bda92f6aa0dd2c4554fe64d24 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:18 -0700 Subject: mm/mempolicy.c: parameter doc uniformization Also fixes kernel-doc warning Signed-off-by: Fabian Frederick Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b09586d8316b..7f7864b95e8e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, /* * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup + * @task: task for fallback if vma policy == default + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. @@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp) #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. 
@@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n) /** * mpol_misplaced - check whether current page node is valid in policy * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. -- cgit v1.2.3 From bdcbb659fe630fc64f6604e99a180bb2ccc630c2 Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Wed, 4 Jun 2014 16:08:21 -0700 Subject: memcg: fold mem_cgroup_stolen It is only used in __mem_cgroup_begin_update_page_stat(), the name is confusing and 2 routines for one thing also confuse people, so fold this function seems more clear. [akpm@linux-foundation.org: fix typo, per Michal] Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03d76628fd9d..4a9dfc83643d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1594,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1653,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -2326,9 +2314,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). 
*/ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); -- cgit v1.2.3 From b5ffc8560cf758422e85b786cca32cd7e1513a7f Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Wed, 4 Jun 2014 16:08:22 -0700 Subject: memcg: correct comments for __mem_cgroup_begin_update_page_stat Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a9dfc83643d..971d7b643f6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2276,12 +2276,11 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but * it tends to be costly. But considering some conditions, we doesn't need * to do so _always_. * @@ -2295,8 +2294,8 @@ cleanup: * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock. */ void __mem_cgroup_begin_update_page_stat(struct page *page, -- cgit v1.2.3 From 073ee1c6cd11cd190f4d0da84d9b4ba79d7b9e70 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:08:23 -0700 Subject: memcg: get rid of memcg_create_cache_name Instead of calling back to memcontrol.c from kmem_cache_create_memcg in order to just create the name of a per memcg cache, let's allocate it in place. We only need to pass the memcg name to kmem_cache_create_memcg for that - everything else can be done in slab_common.c. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 33 +++++++++------------------------ mm/slab_common.c | 7 +++++-- 2 files changed, 14 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 971d7b643f6e..7df7f599e3df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3095,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - static char *buf; - - /* - * We need a mutex here to protect the shared buffer. Since this is - * expected to be called only on cache creation, we can employ the - * slab_mutex for that purpose. 
- */ - lockdep_assert_held(&slab_mutex); - - if (!buf) { - buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!buf) - return NULL; - } - - cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); - return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), buf); -} - int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3158,6 +3135,7 @@ void memcg_free_cache_params(struct kmem_cache *s) static void memcg_kmem_create_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { + static char *memcg_name_buf; /* protected by memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -3173,7 +3151,14 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cachep = kmem_cache_create_memcg(memcg, root_cache); + if (!memcg_name_buf) { + memcg_name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!memcg_name_buf) + return; + } + + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 7e348cff814d..32175617cb75 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -264,13 +264,15 @@ EXPORT_SYMBOL(kmem_cache_create); * kmem_cache_create_memcg - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. + * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) + struct kmem_cache *root_cache, + const char *memcg_name) { struct kmem_cache *s = NULL; char *cache_name; @@ -280,7 +282,8 @@ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); - cache_name = memcg_create_cache_name(memcg, root_cache); + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), memcg_name); if (!cache_name) goto out_unlock; -- cgit v1.2.3 From 93f39eea9c229778361ae7ecf5f5e95d291757da Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:08:24 -0700 Subject: memcg: memcg_kmem_create_cache: make memcg_name_buf statically allocated It isn't worth complicating the code by allocating it on the first access, because it only takes 256 bytes. 
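For illustration only (not part of this patch), the pattern being settled on is roughly the following; the mutex and function names are hypothetical stand-ins for memcg_slab_mutex and the cache-creation path:

    static DEFINE_MUTEX(example_mutex);
    static char example_name_buf[NAME_MAX + 1];	/* protected by example_mutex */

    static void example_build_name(struct kmem_cache *root, int id, const char *cg_name)
    {
    	mutex_lock(&example_mutex);
    	snprintf(example_name_buf, sizeof(example_name_buf), "%s(%d:%s)",
    		 root->name, id, cg_name);
    	/* use example_name_buf while the mutex is still held */
    	mutex_unlock(&example_mutex);
    }

Since every user already serializes on the mutex, the static array needs no NULL check and no allocation-failure path.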
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7df7f599e3df..5e2bfcc96da9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3135,7 +3135,8 @@ void memcg_free_cache_params(struct kmem_cache *s) static void memcg_kmem_create_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char *memcg_name_buf; /* protected by memcg_slab_mutex */ + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -3151,12 +3152,6 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - if (!memcg_name_buf) { - memcg_name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!memcg_name_buf) - return; - } - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); /* -- cgit v1.2.3 From 68711a746345c44ae00c64d8dbac6a9ce13ac54a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:25 -0700 Subject: mm, migration: add destination page freeing callback Memory migration uses a callback defined by the caller to determine how to allocate destination pages. When migration fails for a source page, however, it frees the destination page back to the system. This patch adds a memory migration callback defined by the caller to determine how to free destination pages. If a caller, such as memory compaction, builds its own freelist for migration targets, this can reuse already freed memory instead of scanning additional memory. If the caller provides a function to handle freeing of destination pages, it is called when page migration fails. If the caller passes NULL then freeing back to the system will be handled as usual. This patch introduces no functional change. Signed-off-by: David Rientjes Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/memory-failure.c | 4 ++-- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 55 +++++++++++++++++++++++++++++++++++++---------------- mm/page_alloc.c | 2 +- 6 files changed, 46 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 6010aabde28c..f74a362d2e28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1016,7 +1016,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } nr_migrate = cc->nr_migratepages; - err = migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, NULL, (unsigned long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d50f17fb9be2..3cd1b652821c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1503,7 +1503,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. 
*/ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1584,7 +1584,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cbb7ca0ac44b..469bbf505f85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1394,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. */ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f7864b95e8e..16bc9fa42998 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, + NULL, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 6247be7fa30e..2a459675eeab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,8 +938,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -983,11 +984,17 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. */ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(newpage, private); + else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -1016,8 +1023,9 @@ out: * will wait in the page fault for migration to complete. 
*/ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1086,6 +1104,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1099,7 +1119,8 @@ out: * Returns the number of pages that were not migrated, or an error code. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1273,7 +1295,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 132c337dbe55..027d0294413a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6218,7 +6218,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, MIGRATE_SYNC, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v1.2.3 From d53aea3d46d64e95da9952887969f7533b9ab25e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:26 -0700 Subject: mm, compaction: 
return failed migration target pages back to freelist Greg reported that he found isolated free pages were returned back to the VM rather than the compaction freelist. This will cause holes behind the free scanner and cause it to reallocate additional memory if necessary later. He detected the problem at runtime seeing that ext4 metadata pages (esp the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were constantly visited by compaction calls of migrate_pages(). These pages had a non-zero b_count which caused fallback_migrate_page() -> try_to_release_page() -> try_to_free_buffers() to fail. Memory compaction works by having a "freeing scanner" scan from one end of a zone which isolates pages as migration targets while another "migrating scanner" scans from the other end of the same zone which isolates pages for migration. When page migration fails for an isolated page, the target page is returned to the system rather than the freelist built by the freeing scanner. This may require the freeing scanner to continue scanning memory after suitable migration targets have already been returned to the system needlessly. This patch returns destination pages to the freeing scanner freelist when page migration fails. This prevents unnecessary work done by the freeing scanner but also encourages memory to be as compacted as possible at the end of the zone. Signed-off-by: David Rientjes Reported-by: Greg Thelen Acked-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index f74a362d2e28..d0c7c994e11b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -790,23 +790,32 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. + */ +static void compaction_free(struct page *page, unsigned long data) +{ + struct compact_control *cc = (struct compact_control *)data; + + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; +} + +/* + * We cannot control nr_migratepages fully when migration is running as + * migrate_pages() has no knowledge of of compact_control. When migration is + * complete, we count the number of pages on the list by hand. */ static void update_nr_listpages(struct compact_control *cc) { int nr_migratepages = 0; - int nr_freepages = 0; struct page *page; list_for_each_entry(page, &cc->migratepages, lru) nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; } /* possible outcome of isolate_migratepages */ @@ -1016,8 +1025,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } nr_migrate = cc->nr_migratepages; - err = migrate_pages(&cc->migratepages, compaction_alloc, NULL, - (unsigned long)cc, + err = migrate_pages(&cc->migratepages, compaction_alloc, + compaction_free, (unsigned long)cc, cc->sync ? 
MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); update_nr_listpages(cc); -- cgit v1.2.3 From 35979ef3393110ff3c12c6b94552208d3bdf1a36 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:27 -0700 Subject: mm, compaction: add per-zone migration pfn cache for async compaction Each zone has a cached migration scanner pfn for memory compaction so that subsequent calls to memory compaction can start where the previous call left off. Currently, the compaction migration scanner only updates the per-zone cached pfn when pageblocks were not skipped for async compaction. This creates a dependency on calling sync compaction to avoid having subsequent calls to async compaction from scanning an enormous amount of non-MOVABLE pageblocks each time it is called. On large machines, this could be potentially very expensive. This patch adds a per-zone cached migration scanner pfn only for async compaction. It is updated everytime a pageblock has been scanned in its entirety and when no pages from it were successfully isolated. The cached migration scanner pfn for sync compaction is updated only when called for sync compaction. Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Greg Thelen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 66 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d0c7c994e11b..70c0f8cda33f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. 
+ */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -323,7 +336,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -458,7 +472,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool skipped_async_unsuitable = false; + bool set_unsuitable = true; const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? ISOLATE_UNEVICTABLE : 0); @@ -535,8 +549,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ mt = get_pageblock_migratetype(page); if (!cc->sync && !migrate_async_suitable(mt)) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; + set_unsuitable = false; goto next_pageblock; } } @@ -640,11 +653,10 @@ next_pageblock: /* * Update the pageblock-skip information and cached scanner pfn, * if the whole pageblock was scanned without isolating any page. - * This is not done when pageblock was skipped due to being unsuitable - * for async compaction, so that eventual sync compaction can try. */ - if (low_pfn == end_pfn && !skipped_async_unsuitable) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -868,7 +880,8 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { /* Let the next compaction start anew. */ - zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* @@ -993,7 +1006,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. 
*/ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1001,7 +1014,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); -- cgit v1.2.3 From e0b9daeb453e602a95ea43853dc12d385558ce1f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:28 -0700 Subject: mm, compaction: embed migration mode in compact_control We're going to want to manipulate the migration mode for compaction in the page allocator, and currently compact_control's sync field is only a bool. Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending on the value of this bool. Convert the bool to enum migrate_mode and pass the migration mode in directly. Later, we'll want to avoid MIGRATE_SYNC_LIGHT for thp allocations in the pagefault patch to avoid unnecessary latency. This also alters compaction triggered from sysfs, either for the entire system or for a node, to force MIGRATE_SYNC. [akpm@linux-foundation.org: fix build] [iamjoonsoo.kim@lge.com: use MIGRATE_SYNC in alloc_contig_range()] Signed-off-by: David Rientjes Suggested-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Cc: Naoya Horiguchi Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 36 +++++++++++++++++++----------------- mm/internal.h | 2 +- mm/page_alloc.c | 39 +++++++++++++++++---------------------- 3 files changed, 37 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 70c0f8cda33f..217a6ad9a20e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc, return; if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1]) + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) zone->compact_cached_migrate_pfn[1] = pfn; } else { if (cc->finished_update_free) @@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended */ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -473,7 +474,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, bool locked = false; struct page *page = NULL, *valid_page = NULL; bool set_unsuitable = true; - const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? 
ISOLATE_UNEVICTABLE : 0); /* @@ -483,7 +485,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -548,7 +550,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * the minimum amount of work satisfies the allocation */ mt = get_pageblock_migratetype(page); - if (!cc->sync && !migrate_async_suitable(mt)) { + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { set_unsuitable = false; goto next_pageblock; } @@ -981,6 +984,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -1006,7 +1010,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync]; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1040,8 +1044,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - compaction_free, (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -1074,9 +1077,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1085,7 +1087,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1107,7 +1109,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1115,7 +1117,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1140,7 +1142,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = 
max(status, rc); @@ -1190,7 +1192,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1203,7 +1205,7 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; diff --git a/mm/internal.h b/mm/internal.h index 6ee580d69ddd..a25424a24e0c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027d0294413a..afb29da0576c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2217,7 +2217,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2231,7 +2231,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2264,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2277,9 +2277,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int migratetype, enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2474,7 +2473,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2568,17 +2567,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. 
Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2653,12 +2650,10 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -6218,7 +6213,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6257,7 +6252,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3 From 75f30861a12a6b09b759dfeeb9290b681af89057 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:30 -0700 Subject: mm, thp: avoid excessive compaction latency during fault Synchronous memory compaction can be very expensive: it can iterate an enormous amount of memory without aborting, constantly rescheduling, waiting on page locks and lru_lock, etc, if a pageblock cannot be defragmented. Unfortunately, it's too expensive for transparent hugepage page faults and it's much better to simply fallback to pages. On 128GB machines, we find that synchronous memory compaction can take O(seconds) for a single thp fault. Now that async compaction remembers where it left off without strictly relying on sync compaction, this makes thp allocations best-effort without causing egregious latency during fault. We still need to retry async compaction after reclaim, but this won't stall for seconds. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Greg Thelen Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afb29da0576c..d88d67584765 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2575,7 +2575,14 @@ rebalance: &did_some_progress); if (page) goto got_pg; - migration_mode = MIGRATE_SYNC_LIGHT; + + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. 
+ */ + if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because -- cgit v1.2.3 From aeef4b83806f49a0c454b7d4578671b71045bee2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:31 -0700 Subject: mm, compaction: terminate async compaction when rescheduling Async compaction terminates prematurely when need_resched(), see compact_checklock_irqsave(). This can never trigger, however, if the cond_resched() in isolate_migratepages_range() always takes care of the scheduling. If the cond_resched() actually triggers, then terminate this pageblock scan for async compaction as well. Signed-off-by: David Rientjes Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 217a6ad9a20e..56331f5124ba 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -494,8 +494,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (cond_resched()) { + /* Async terminates prematurely on need_resched() */ + if (cc->mode == MIGRATE_ASYNC) + return 0; + } + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { -- cgit v1.2.3 From f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:08:32 -0700 Subject: mm/compaction: do not count migratepages when unnecessary During compaction, update_nr_listpages() has been used to count remaining non-migrated and free pages after a call to migrage_pages(). The freepages counting has become unneccessary, and it turns out that migratepages counting is also unnecessary in most cases. The only situation when it's needed to count cc->migratepages is when migrate_pages() returns with a negative error code. Otherwise, the non-negative return value is the number of pages that were not migrated, which is exactly the count of remaining pages in the cc->migratepages list. Furthermore, any non-zero count is only interesting for the tracepoint of mm_compaction_migratepages events, because after that all remaining unmigrated pages are put back and their count is set to 0. This patch therefore removes update_nr_listpages() completely, and changes the tracepoint definition so that the manual counting is done only when the tracepoint is enabled, and only when migrate_pages() returns a negative error code. Furthermore, migrate_pages() and the tracepoints won't be called when there's nothing to migrate. This potentially avoids some wasted cycles and reduces the volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0 nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events. The mm_compaction_isolate_migratepages event is better for determining that nothing was isolated for migration, and this one was just duplicating the info. 
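A rough sketch of the counting rule described above (illustrative only; the helper name is made up and this is not code added by the patch). The list only has to be walked by hand when migrate_pages() comes back with a negative error code, and even then only if a consumer such as the tracepoint wants the number:

    static unsigned long example_nr_not_migrated(struct list_head *migratepages, int err)
    {
    	struct page *page;
    	unsigned long nr = 0;

    	if (err >= 0)
    		return err;		/* migrate_pages() already counted these */

    	list_for_each_entry(page, migratepages, lru)
    		nr++;			/* manual count, only on a real error */
    	return nr;
    }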
Signed-off-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Acked-by: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 56331f5124ba..3c60e3d5237e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -822,22 +822,6 @@ static void compaction_free(struct page *page, unsigned long data) cc->nr_freepages++; } -/* - * We cannot control nr_migratepages fully when migration is running as - * migrate_pages() has no knowledge of of compact_control. When migration is - * complete, we count the number of pages on the list by hand. - */ -static void update_nr_listpages(struct compact_control *cc) -{ - int nr_migratepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - - cc->nr_migratepages = nr_migratepages; -} - /* possible outcome of isolate_migratepages */ typedef enum { ISOLATE_ABORT, /* Abort compaction now */ @@ -1032,7 +1016,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1047,20 +1030,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if (err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it -- cgit v1.2.3 From e9ade569910a82614ff5f2c2cea2b65a8d785da4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:08:34 -0700 Subject: mm/compaction: avoid rescanning pageblocks in isolate_freepages The compaction free scanner in isolate_freepages() currently remembers PFN of the highest pageblock where it successfully isolates, to be used as the starting pageblock for the next invocation. The rationale behind this is that page migration might return free pages to the allocator when migration fails and we don't want to skip them if the compaction continues. Since migration now returns free pages back to compaction code where they can be reused, this is no longer a concern. This patch changes isolate_freepages() so that the PFN for restarting is updated with each pageblock where isolation is attempted. Using stress-highalloc from mmtests, this resulted in 10% reduction of the pages scanned by the free scanner. Note that the somewhat similar functionality that records highest successful pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is used when the whole compaction is restarted, not for multiple invocations of the free scanner during single compaction. 
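In heavily simplified form, the free scanner loop after this change looks roughly like this (fragment only; example_isolate_block() is a hypothetical stand-in for isolate_freepages_block() and the surrounding declarations are omitted):

    for (block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages - 1);
         block_start_pfn >= low_pfn;
         block_start_pfn -= pageblock_nr_pages) {
    	cc->free_pfn = block_start_pfn;		/* next invocation restarts here */
    	isolated = example_isolate_block(cc, block_start_pfn);
    	if (isolated)
    		cc->finished_update_free = true;	/* stop moving the zone-wide cache */
    }

The restart hint therefore ends up at the lowest pageblock that was attempted, whether or not anything was isolated from it.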
Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Acked-by: Michal Nazarewicz Reviewed-by: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 3c60e3d5237e..58441220b953 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -688,7 +688,6 @@ static void isolate_freepages(struct zone *zone, unsigned long block_start_pfn; /* start of current pageblock */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - unsigned long next_free_pfn; /* start pfn for scaning at next round */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -708,12 +707,6 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); - /* - * If no pages are isolated, the block_start_pfn < low_pfn check - * will kick in. - */ - next_free_pfn = 0; - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate @@ -754,19 +747,19 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ + cc->free_pfn = block_start_pfn; isolated = isolate_freepages_block(cc, block_start_pfn, block_end_pfn, freelist, false); nr_freepages += isolated; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. */ - if (isolated && next_free_pfn == 0) { + if (isolated) cc->finished_update_free = true; - next_free_pfn = block_start_pfn; - } } /* split_free_page does not map the pages */ @@ -777,9 +770,8 @@ static void isolate_freepages(struct zone *zone, * so that compact_finished() may detect this */ if (block_start_pfn < low_pfn) - next_free_pfn = cc->migrate_pfn; + cc->free_pfn = cc->migrate_pfn; - cc->free_pfn = next_free_pfn; cc->nr_freepages = nr_freepages; } -- cgit v1.2.3 From bea04b073292b2acb522c7c1aa67a4fc58151530 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:09:51 -0700 Subject: mm: use the light version __mod_zone_page_state in mlocked_vma_newpage() mlocked_vma_newpage() is called with the pte lock held (a spinlock), which implies preemption disabled, and the vm stat counter is not modified from interrupt context, so we need not use an irq-safe mod_zone_page_state() here; using the light-weight __mod_zone_page_state() would be OK. This patch also documents __mod_zone_page_state() and some of its callsites. The comment above __mod_zone_page_state() is from Hugh Dickins, and acked by Christoph. Most credits to Hugh and Christoph for the clarification on the usage of the __mod_zone_page_state().
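As a toy illustration of why the cheap variant is safe here (userspace sketch; the toy_* names and the stand-in comments are not from the kernel): __mod_zone_page_state() skips the interrupt protection that mod_zone_page_state() pays for, which is fine as long as the caller already excludes both interrupts and preemption, e.g. by holding the pte lock.

    #include <stdio.h>

    static long nr_mlock;   /* stands in for a per-zone vmstat counter */

    /* Expensive form: protects the read-modify-write against interrupts. */
    static void toy_mod_state(long delta)
    {
            /* local_irq_save() would go here */
            nr_mlock += delta;
            /* local_irq_restore() would go here */
    }

    /*
     * Cheap form: no interrupt protection.  Correct only when the counter
     * is never touched from interrupt context and the caller cannot be
     * preempted, e.g. because it holds a spinlock such as the pte lock.
     */
    static void toy___mod_state(long delta)
    {
            nr_mlock += delta;
    }

    int main(void)
    {
            toy_mod_state(1);       /* safe anywhere */
            toy___mod_state(1);     /* safe under the pte lock */
            printf("NR_MLOCK = %ld\n", nr_mlock);
            return 0;
    }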
[akpm@linux-foundation.org: coding-style fixes] Suggested-by: Andrew Morton Acked-by: Hugh Dickins Signed-off-by: Jianyu Zhan Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 7 ++++++- mm/rmap.c | 11 +++++++++++ mm/vmstat.c | 4 +++- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a25424a24e0c..e067984bafa0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,7 +201,12 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } diff --git a/mm/rmap.c b/mm/rmap.c index 8754e1fa83b6..4644e10248f0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -988,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption + * disabled. + */ if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1079,6 +1085,11 @@ void page_remove_rmap(struct page *page) /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. + * + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. */ if (unlikely(PageHuge(page))) goto out; diff --git a/mm/vmstat.c b/mm/vmstat.c index 376bd2d21482..b37bd49bfd55 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, } /* - * For use when we know that interrupts are disabled. + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) -- cgit v1.2.3 From 7ee07a44eb53374a73544ae14c71366a02d462e0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:09:52 -0700 Subject: mm: fold mlocked_vma_newpage() into its only call site In the previous commit (mm: use the light version __mod_zone_page_state in mlocked_vma_newpage()) an irq-unsafe __mod_zone_page_state is used. And as suggested by Andrew, to reduce the risk that new call sites use mlocked_vma_newpage() incorrectly without knowing they are introducing races, this patch folds mlocked_vma_newpage() into its only call site, page_add_new_anon_rmap, to make it open-coded for people to know what is going on.
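A rough sketch of the folded logic (userspace model; the VM_LOCKED/VM_SPECIAL values and the toy_* helper are invented, and the real code also sets PageMlocked and bumps NR_MLOCK): the open-coded check in page_add_new_anon_rmap() decides between the normal LRU and the unevictable list.

    #include <stdio.h>

    #define VM_LOCKED       0x1UL
    #define VM_SPECIAL      0x2UL

    /*
     * Toy version of the decision that is now open-coded in
     * page_add_new_anon_rmap(): a freshly faulted anon page goes to the
     * active LRU unless the vma is LOCKED and not SPECIAL, in which case
     * it is treated as mlocked and parked on the unevictable list.
     */
    static const char *toy_place_new_anon_page(unsigned long vm_flags)
    {
            if ((vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)
                    return "active LRU";
            return "unevictable list (mlocked, NR_MLOCK bumped)";
    }

    int main(void)
    {
            printf("plain vma     -> %s\n", toy_place_new_anon_page(0));
            printf("mlock()ed vma -> %s\n", toy_place_new_anon_page(VM_LOCKED));
            return 0;
    }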
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jianyu Zhan Suggested-by: Andrew Morton Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 29 ----------------------------- mm/rmap.c | 20 +++++++++++++++++--- 2 files changed, 17 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index e067984bafa0..802c3a4fc03a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,31 +188,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } -/* - * Called only in fault path, to determine if a new page is being - * mapped into a LOCKED vma. If it is, mark page as mlocked. - */ -static inline int mlocked_vma_newpage(struct vm_area_struct *vma, - struct page *page) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - return 0; - - if (!TestSetPageMlocked(page)) { - /* - * We use the irq-unsafe __mod_zone_page_stat because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - return 1; -} - /* * must be called with vma's mmap_sem held for read or write, and page locked. */ @@ -255,10 +230,6 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) -{ - return 0; -} static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } diff --git a/mm/rmap.c b/mm/rmap.c index 4644e10248f0..e375ce4bd93e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1032,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) { + + VM_BUG_ON_PAGE(PageLRU(page), page); + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { SetPageActive(page); lru_cache_add(page); - } else - add_page_to_unevictable_list(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); } /** -- cgit v1.2.3 From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:53 -0700 Subject: swap: change swap_info singly-linked list to list_head The logic controlling the singly-linked list of swap_info_struct entries for all active, i.e. 
swapon'ed, swap targets is rather complex, because: - it stores the entries in priority order - there is a pointer to the highest priority entry - there is a pointer to the highest priority not-full entry - there is a highest_priority_index variable set outside the swap_lock - swap entries of equal priority should be used equally this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181 where different priority swap targets are incorrectly used equally. That bug probably could be solved with the existing singly-linked lists, but I think it would only add more complexity to the already difficult to understand get_swap_page() swap_list iteration logic. The first patch changes from a singly-linked list to a doubly-linked list using list_heads; the highest_priority_index and related code are removed and get_swap_page() starts each iteration at the highest priority swap_info entry, even if it's full. While this does introduce unnecessary list iteration (i.e. Schlemiel the painter's algorithm) in the case where one or more of the highest priority entries are full, the iteration and manipulation code is much simpler and behaves correctly re: the above bug; and the fourth patch removes the unnecessary iteration. The second patch adds some minor plist helper functions; nothing new really, just functions to match existing regular list functions. These are used by the next two patches. The third patch adds plist_requeue(), which is used by get_swap_page() in the next patch - it performs the requeueing of same-priority entries (which moves the entry to the end of its priority in the plist), so that all equal-priority swap_info_structs get used equally. The fourth patch converts the main list into a plist, and adds a new plist that contains only swap_info entries that are both active and not full. As Mel suggested using plists allows removing all the ordering code from swap - plists handle ordering automatically. The list naming is also clarified now that there are two lists, with the original list changed from swap_list_head to swap_active_head and the new list named swap_avail_head. A new spinlock is also added for the new list, so swap_info entries can be added or removed from the new list immediately as they become full or not full. This patch (of 4): Replace the singly-linked list tracking active, i.e. swapon'ed, swap_info_struct entries with a doubly-linked list using struct list_heads. Simplify the logic iterating and manipulating the list of entries, especially get_swap_page(), by using standard list_head functions, and removing the highest priority iteration logic. The change fixes the bug: https://lkml.org/lkml/2014/2/13/181 in which different priority swap entries after the highest priority entry are incorrectly used equally in pairs. The swap behavior is now as advertised, i.e. different priority swap entries are used in order, and equal priority swap targets are used concurrently. 
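To illustrate the ordering rule the new list relies on (standalone sketch; struct toy_si and the priorities are invented, and the real code uses the struct list_head helpers): a newly enabled swap device is linked in front of the first entry whose priority is not higher, so the list stays sorted high-to-low, and get_swap_page() then rotates equal-priority entries so they are used round-robin.

    #include <stdio.h>

    /* Toy swap_info_struct: doubly linked, kept sorted by priority. */
    struct toy_si {
            int prio;
            struct toy_si *prev, *next;
    };

    static struct toy_si head = { 0, &head, &head };

    /* Same placement rule as the new _enable_swap_info(). */
    static void toy_enable(struct toy_si *p)
    {
            struct toy_si *pos = head.next;

            while (pos != &head && p->prio < pos->prio)
                    pos = pos->next;
            /* link p just before pos (the list_add_tail() equivalent) */
            p->prev = pos->prev;
            p->next = pos;
            pos->prev->next = p;
            pos->prev = p;
    }

    int main(void)
    {
            struct toy_si a = { -1 }, b = { 5 }, c = { 5 };
            struct toy_si *pos;

            toy_enable(&a);
            toy_enable(&b);
            toy_enable(&c);
            for (pos = head.next; pos != &head; pos = pos->next)
                    printf("prio %d\n", pos->prio);  /* prints 5, 5, -1 */
            return 0;
    }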
Signed-off-by: Dan Streetman Acked-by: Mel Gorman Cc: Shaohua Li Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Gortmaker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 13 ++--- mm/swapfile.c | 171 ++++++++++++++++++++++++--------------------------------- 2 files changed, 76 insertions(+), 108 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..fae11602e8a9 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); static unsigned long __frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + list_for_each_entry(si, &swap_list_head, list) totalpages += atomic_read(&si->frontswap_pages); - } return totalpages; } @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, int si_frontswap_pages; unsigned long total_pages_to_unuse = total; unsigned long pages = 0, pages_to_unuse = 0; - int type; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + list_for_each_entry(si, &swap_list_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } vm_unacct_memory(pages); *unused = pages_to_unuse; - *swapid = type; + *swapid = si->type; ret = 0; break; } @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time + * so restart scan from swap_list_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6992b6..6c95a8c63b1a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages; /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; -static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. 
+ */ +LIST_HEAD(swap_list_head); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -640,66 +643,54 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - int hp_index; + struct list_head *tmp; spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - hp_index = atomic_xchg(&highest_priority_index, -1); - /* - * highest_priority_index records current highest priority swap - * type which just frees swap entries. If its priority is - * higher than that of swap_list.next swap type, we use it. It - * isn't protected by swap_lock, so it can be an invalid value - * if the corresponding swap type is swapoff. We double check - * the flags here. It's even possible the swap type is swapoff - * and swapon again and its priority is changed. In such rare - * case, low prority swap type might be used, but eventually - * high priority swap will be used after several rounds of - * swap. - */ - if (hp_index != -1 && hp_index != type && - swap_info[type]->prio < swap_info[hp_index]->prio && - (swap_info[hp_index]->flags & SWP_WRITEOK)) { - type = hp_index; - swap_list.next = type; - } - - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; - } - + list_for_each(tmp, &swap_list_head) { + si = list_entry(tmp, typeof(*si), list); spin_lock(&si->lock); - if (!si->highest_bit) { - spin_unlock(&si->lock); - continue; - } - if (!(si->flags & SWP_WRITEOK)) { + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_unlock(&si->lock); continue; } - swap_list.next = next; + /* + * rotate the current swap_info that we're going to use + * to after any other swap_info that have the same prio, + * so that all equal-priority swap_info get used equally + */ + next = si; + list_for_each_entry_continue(next, &swap_list_head, list) { + if (si->prio != next->prio) + break; + list_rotate_left(&si->list); + next = si; + } spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) - return swp_entry(type, offset); + return swp_entry(si->type, offset); spin_lock(&swap_lock); - next = swap_list.next; + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one. So we need to + * try again. Since we dropped the swap_lock, there may now + * be non-full higher priority swap_infos, and this si may have + * even been removed from the list (although very unlikely). + * Let's start over. + */ + tmp = &swap_list_head; } atomic_long_inc(&nr_swap_pages); @@ -766,27 +757,6 @@ out: return NULL; } -/* - * This swap type frees swap entry, check if it is the highest priority swap - * type which just frees swap entry. get_swap_page() uses - * highest_priority_index to search highest priority swap type. The - * swap_info_struct.lock can't protect us if there are multiple swap types - * active, so we use atomic_cmpxchg. 
- */ -static void set_highest_priority_index(int type) -{ - int old_hp_index, new_hp_index; - - do { - old_hp_index = atomic_read(&highest_priority_index); - if (old_hp_index != -1 && - swap_info[old_hp_index]->prio >= swap_info[type]->prio) - break; - new_hp_index = type; - } while (atomic_cmpxchg(&highest_priority_index, - old_hp_index, new_hp_index) != old_hp_index); -} - static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - set_highest_priority_index(p->type); atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - int i, prev; + struct swap_info_struct *si; if (prio >= 0) p->prio = prio; @@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; + assert_spin_locked(&swap_lock); + BUG_ON(!list_empty(&p->list)); + /* + * insert into swap list; the list is in priority order, + * so that get_swap_page() can get a page from the highest + * priority swap_info_struct with available page(s), and + * swapoff can adjust the auto-assigned (i.e. negative) prio + * values for any lower-priority swap_info_structs when + * removing a negative-prio swap_info_struct + */ + list_for_each_entry(si, &swap_list_head, list) { + if (p->prio >= si->prio) { + list_add_tail(&p->list, &si->list); + return; + } } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; + /* + * this covers two cases: + * 1) p->prio is less than all existing prio + * 2) the swap list is empty + */ + list_add_tail(&p->list, &swap_list_head); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int i, type, prev; - int err; + int err, found = 0; unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) @@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + list_for_each_entry(p, &swap_list_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; @@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == swap_list.next) { - /* just pick something that's safe... 
*/ - swap_list.next = swap_list.head; - } spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + list_for_each_entry_continue(si, &swap_list_head, list) { + si->prio++; + } least_priority++; } + list_del_init(&p->list); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); set_current_oom_origin(); - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); if (err) { @@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); - frontswap_invalidate_area(type); + frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); @@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account information */ - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + INIT_LIST_HEAD(&p->list); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); spin_lock_init(&p->lock); -- cgit v1.2.3 From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:59 -0700 Subject: swap: change swap_list_head to plist, add swap_avail_head Originally get_swap_page() started iterating through the singly-linked list of swap_info_structs using swap_list.next or highest_priority_index, both of which were intended to point to the highest priority active swap target that was not full. The first patch in this series changed the singly-linked list to a doubly-linked list, and removed the logic to start at the highest priority non-full entry; it starts scanning at the highest priority entry each time, even if the entry is full. Replace the manually ordered swap_list_head with a plist, swap_active_head. Add a new plist, swap_avail_head. The original swap_active_head plist contains all active swap_info_structs, as before, while the new swap_avail_head plist contains only swap_info_structs that are active and available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list. Mel Gorman suggested using plists since they internally handle ordering the list entries based on priority, which is exactly what swap was doing manually. All the ordering code is now removed, and swap_info_struct entries are simply added to their corresponding plist and automatically ordered correctly. Using a new plist for available swap_info_structs simplifies and optimizes get_swap_page(), which no longer has to iterate over full swap_info_structs. Using a new spinlock for the swap_avail_head plist allows each swap_info_struct to add or remove itself from the plist when it becomes full or not-full; previously it could not do so because the swap_info_struct->lock is held when it changes from full<->not-full, and the swap_lock protecting the main swap_active_head must be ordered before any swap_info_struct->lock.
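For a feel of the two-list scheme (toy userspace model; struct toy_swapdev and its fields are invented, and the real code moves plist entries under swap_avail_lock): a device stays on the active list for its whole lifetime but drops off the avail list the moment it fills up, and is put back as soon as a slot is freed, so allocation never has to step over full devices.

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_swapdev {
            const char *name;
            long free_slots;
            bool on_avail;  /* on the not-full (avail) list? */
    };

    /* Allocation path: device falls off the avail list when it fills up. */
    static void toy_alloc_slot(struct toy_swapdev *d)
    {
            if (!d->on_avail)
                    return;
            if (--d->free_slots == 0) {
                    d->on_avail = false;    /* plist_del(&si->avail_list, ...) */
                    printf("%s: full, dropped from the avail list\n", d->name);
            }
    }

    /* Free path: device reappears on the avail list when space returns. */
    static void toy_free_slot(struct toy_swapdev *d)
    {
            if (d->free_slots++ == 0) {
                    d->on_avail = true;     /* plist_add(&si->avail_list, ...) */
                    printf("%s: not full anymore, back on the avail list\n", d->name);
            }
    }

    int main(void)
    {
            struct toy_swapdev d = { "swap0", 1, true };

            toy_alloc_slot(&d);     /* last slot used -> leaves avail list */
            toy_free_slot(&d);      /* slot freed     -> rejoins avail list */
            return 0;
    }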
Signed-off-by: Dan Streetman Acked-by: Mel Gorman Cc: Shaohua Li Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Cc: Paul Gortmaker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 6 +-- mm/swapfile.c | 145 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 94 insertions(+), 57 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index fae11602e8a9..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void) struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - list_for_each_entry(si, &swap_list_head, list) + plist_for_each_entry(si, &swap_active_head, list) totalpages += atomic_read(&si->frontswap_pages); return totalpages; } @@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, unsigned long pages = 0, pages_to_unuse = 0; assert_spin_locked(&swap_lock); - list_for_each_entry(si, &swap_list_head, list) { + plist_for_each_entry(si, &swap_active_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list_head each time + * so restart scan from swap_active_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c95a8c63b1a..beeeef8a1b2d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry "; * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ -LIST_HEAD(swap_list_head); +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. 
+ */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -594,6 +609,9 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); @@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void) { struct swap_info_struct *si, *next; pgoff_t offset; - struct list_head *tmp; - spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - list_for_each(tmp, &swap_list_head) { - si = list_entry(tmp, typeof(*si), list); + spin_lock(&swap_avail_lock); + +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&si->lock); - continue; + goto nextsi; } - /* - * rotate the current swap_info that we're going to use - * to after any other swap_info that have the same prio, - * so that all equal-priority swap_info get used equally - */ - next = si; - list_for_each_entry_continue(next, &swap_list_head, list) { - if (si->prio != next->prio) - break; - list_rotate_left(&si->list); - next = si; - } - - spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) return swp_entry(si->type, offset); - spin_lock(&swap_lock); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map() can drop the si->lock, multiple * callers probably all tried to get a page from the same si - * and it filled up before we could get one. So we need to - * try again. Since we dropped the swap_lock, there may now - * be non-full higher priority swap_infos, and this si may have - * even been removed from the list (although very unlikely). - * Let's start over. + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. 
*/ - tmp = &swap_list_head; + if (plist_node_empty(&next->avail_list)) + goto start_over; } + spin_unlock(&swap_avail_lock); + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -798,8 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - struct swap_info_struct *si; - if (prio >= 0) p->prio = prio; else p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, total_swap_pages += p->pages; assert_spin_locked(&swap_lock); - BUG_ON(!list_empty(&p->list)); - /* - * insert into swap list; the list is in priority order, - * so that get_swap_page() can get a page from the highest - * priority swap_info_struct with available page(s), and - * swapoff can adjust the auto-assigned (i.e. negative) prio - * values for any lower-priority swap_info_structs when - * removing a negative-prio swap_info_struct - */ - list_for_each_entry(si, &swap_list_head, list) { - if (p->prio >= si->prio) { - list_add_tail(&p->list, &si->list); - return; - } - } /* - * this covers two cases: - * 1) p->prio is less than all existing prio - * 2) the swap list is empty + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. 
*/ - list_add_tail(&p->list, &swap_list_head); + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; spin_lock(&swap_lock); - list_for_each_entry(p, &swap_list_head, list) { + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; @@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; - list_for_each_entry_continue(si, &swap_list_head, list) { + plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; + si->list.prio--; + si->avail_list.prio--; } least_priority++; } - list_del_init(&p->list); + plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); - INIT_LIST_HEAD(&p->list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); -- cgit v1.2.3 From 172cb4b3d49a1339dd67ee05e3f47972a70f556f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Jun 2014 16:10:02 -0700 Subject: mm/dmapool.c: reuse devres_release() to free resources Instead of calling an additional routine in dmam_pool_destroy() rely on what dmam_pool_release() is doing. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index a3a1bfe91110..306baa594f95 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); - dma_pool_destroy(pool); + WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); } EXPORT_SYMBOL(dmam_pool_destroy); -- cgit v1.2.3 From 776ed0f0377914d1e65fed903c052e9eef3f4cc3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:10:02 -0700 Subject: memcg: cleanup kmem cache creation/destruction functions naming Current names are rather inconsistent. Let's try to improve them. 
Brief change log: ** old name ** ** new name ** kmem_cache_create_memcg memcg_create_kmem_cache memcg_kmem_create_cache memcg_regsiter_cache memcg_kmem_destroy_cache memcg_unregister_cache kmem_cache_destroy_memcg_children memcg_cleanup_cache_params mem_cgroup_destroy_all_caches memcg_unregister_all_caches create_work memcg_register_cache_work memcg_create_cache_work_func memcg_register_cache_func memcg_create_cache_enqueue memcg_schedule_register_cache Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 60 ++++++++++++++++++++++++++------------------------------ mm/slab_common.c | 12 ++++++------ 2 files changed, 34 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5e2bfcc96da9..d176edb1d5e8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3132,8 +3132,8 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -static void memcg_kmem_create_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by memcg_slab_mutex */ @@ -3153,7 +3153,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, return; cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -3175,7 +3175,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, root_cache->memcg_params->memcg_caches[id] = cachep; } -static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) +static void memcg_unregister_cache(struct kmem_cache *cachep) { struct kmem_cache *root_cache; struct mem_cgroup *memcg; @@ -3228,7 +3228,7 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; @@ -3239,7 +3239,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) if (!c) continue; - memcg_kmem_destroy_cache(c); + memcg_unregister_cache(c); if (cache_from_memcg_idx(s, i)) failed++; @@ -3248,7 +3248,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) return failed; } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; @@ -3261,25 +3261,26 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) cachep = memcg_params_to_cache(params); kmem_cache_shrink(cachep); if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_kmem_destroy_cache(cachep); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } -struct create_work { +struct memcg_register_cache_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw = container_of(w, struct create_work, work); + struct memcg_register_cache_work *cw = + container_of(w, struct 
memcg_register_cache_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; mutex_lock(&memcg_slab_mutex); - memcg_kmem_create_cache(memcg, cachep); + memcg_register_cache(memcg, cachep); mutex_unlock(&memcg_slab_mutex); css_put(&memcg->css); @@ -3289,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3303,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3322,7 +3323,7 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } @@ -3393,16 +3394,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. 
*/ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); @@ -3526,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -6372,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 32175617cb75..48fafb61f35e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -261,7 +261,7 @@ EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_MEMCG_KMEM /* - * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. * @memcg_name: The name of the memory cgroup (used for naming the new cache). @@ -270,7 +270,7 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache, const char *memcg_name) { @@ -305,7 +305,7 @@ out_unlock: return s; } -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { int rc; @@ -314,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) return 0; mutex_unlock(&slab_mutex); - rc = __kmem_cache_destroy_memcg_children(s); + rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); return rc; } #else -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { return 0; } @@ -343,7 +343,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (kmem_cache_destroy_memcg_children(s) != 0) + if (memcg_cleanup_cache_params(s) != 0) goto out_unlock; list_del(&s->list); -- cgit v1.2.3 From 0bd62b1190607e4f1b3c2927ba48672a1cf2a83d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:10:03 -0700 Subject: slab: delete cache from list after __kmem_cache_shutdown succeeds Currently, on kmem_cache_destroy we delete the cache from the slab_list before __kmem_cache_shutdown, inserting it back to the list on failure. Initially, this was done, because we could release the slab_mutex in __kmem_cache_shutdown to delete sysfs slub entry, but since commit 41a212859a4d ("slub: use sysfs'es release mechanism for kmem_cache") we remove sysfs entry later in kmem_cache_destroy after dropping the slab_mutex, so that no implementation of __kmem_cache_shutdown can ever release the lock. Therefore we can simplify the code a bit by moving list_del after __kmem_cache_shutdown. 
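A compact sketch of the reordered teardown (illustrative only; the toy_* helpers are invented and error reporting is reduced to printf): the cache is unlinked from the global list only after shutdown has succeeded, so the failure path no longer needs the list_add() rollback.

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-in for __kmem_cache_shutdown(): fails while objects remain. */
    static bool toy_shutdown(bool has_objects)
    {
            return !has_objects;
    }

    static void toy_cache_destroy(const char *name, bool has_objects)
    {
            if (!toy_shutdown(has_objects)) {
                    /* The cache was never unlinked, so there is nothing to undo. */
                    printf("kmem_cache_destroy %s: Slab cache still has objects\n",
                           name);
                    return;
            }
            /* Only now: list_del(&s->list) and the actual freeing. */
            printf("%s: unlinked and destroyed\n", name);
    }

    int main(void)
    {
            toy_cache_destroy("busy-cache", true);
            toy_cache_destroy("idle-cache", false);
            return 0;
    }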
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 48fafb61f35e..735e01a0db6f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -346,15 +346,15 @@ void kmem_cache_destroy(struct kmem_cache *s) if (memcg_cleanup_cache_params(s) != 0) goto out_unlock; - list_del(&s->list); if (__kmem_cache_shutdown(s) != 0) { - list_add(&s->list, &slab_caches); printk(KERN_ERR "kmem_cache_destroy %s: " "Slab cache still has objects\n", s->name); dump_stack(); goto out_unlock; } + list_del(&s->list); + mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); -- cgit v1.2.3 From 65bb371984d6a2c909244eb749e482bb40b72e36 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:05 -0700 Subject: mm: page_alloc: do not update zlc unless the zlc is active The zlc is used on NUMA machines to quickly skip over zones that are full. However it is always updated, even for the first zone scanned when the zlc might not even be active. As it's a write to a bitmap that potentially bounces cache line it's deceptively expensive and most machines will not care. Only update the zlc if it was active. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d88d67584765..8e766241cf56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2035,7 +2035,7 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } -- cgit v1.2.3 From 800a1e750c7b04c2aa2459afca77e936e01c0029 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:06 -0700 Subject: mm: page_alloc: do not treat a zone that cannot be used for dirty pages as "full" If a zone cannot be used for a dirty page then it gets marked "full" which is cached in the zlc and later potentially skipped by allocation requests that have nothing to do with dirty zones. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e766241cf56..b4381eaee715 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1967,7 +1967,7 @@ zonelist_scan: */ if ((alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, -- cgit v1.2.3 From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:08 -0700 Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets If cpusets are not in use then we still check a global variable on every page allocation. Use jump labels to avoid the overhead. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4381eaee715..a2955e101715 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1921,7 +1921,8 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -- cgit v1.2.3 From d34c5fa06fade08a689fc171bf756fba2858ae73 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:10 -0700 Subject: mm: page_alloc: only check the zone id check if pages are buddies A node/zone index is used to check if pages are compatible for merging but this happens unconditionally even if the buddy page is not free. Defer the calculation as long as possible. Ideally we would check the zone boundary but nodes can overlap. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2955e101715..da526905b4a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -509,16 +509,26 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; -- cgit v1.2.3 From a6e21b14f22041382e832d30deda6f26f37b1097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:12 -0700 Subject: mm: page_alloc: only check the alloc flags and gfp_mask for dirty once Currently it's calculated once per zone in the zonelist. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da526905b4a5..30f327a720fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1917,6 +1917,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); classzone_idx = zone_idx(preferred_zone); zonelist_scan: @@ -1976,8 +1978,7 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + if (consider_zone_dirty && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; -- cgit v1.2.3 From 5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:14 -0700 Subject: mm: page_alloc: take the ALLOC_NO_WATERMARK check out of the fast path ALLOC_NO_WATERMARK is set in a few cases. Always by kswapd, always for __GFP_MEMALLOC, sometimes for swap-over-nfs, tasks etc. Each of these cases are relatively rare events but the ALLOC_NO_WATERMARK check is an unlikely branch in the fast path. This patch moves the check out of the fast path and after it has been determined that the watermarks have not been met. This helps the common fast path at the cost of making the slow path slower and hitting kswapd with a performance cost. It's a reasonable tradeoff. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30f327a720fd..485932c577e7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1937,9 +1937,6 @@ zonelist_scan: (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a @@ -1986,6 +1983,11 @@ zonelist_scan: classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* -- cgit v1.2.3 From e58469bafd0524e848c3733bc3918d854595e20f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:16 -0700 Subject: mm: page_alloc: use word-based accesses for get/set pageblock bitmaps The test_bit operations in get/set pageblock flags are expensive. This patch reads the bitmap on a word basis and use shifts and masks to isolate the bits of interest. 
Similarly masks are used to set a local copy of the bitmap and then use cmpxchg to update the bitmap if there have been no other changes made in parallel. In a test running dd onto tmpfs the overhead of the pageblock-related functions went from 1.27% in profiles to 0.5%. In addition to the performance benefits, this patch closes races that are possible between: a) get_ and set_pageblock_migratetype(), where get_pageblock_migratetype() reads part of the bits before and other part of the bits after set_pageblock_migratetype() has updated them. b) set_pageblock_migratetype() and set_pageblock_skip(), where the non-atomic read-modify-update set bit operation in set_pageblock_skip() will cause lost updates to some bits changed in the set_pageblock_migratetype(). Joonsoo Kim first reported the case a) via code inspection. Vlastimil Babka's testing with a debug patch showed that either a) or b) occurs roughly once per mmtests' stress-highalloc benchmark (although not necessarily in the same pageblock). Furthermore during development of unrelated compaction patches, it was observed that frequent calls to {start,undo}_isolate_page_range() the race occurs several thousands of times and has resulted in NULL pointer dereferences in move_freepages() and free_one_page() in places where free_list[migratetype] is manipulated by e.g. list_move(). Further debugging confirmed that migratetype had invalid value of 6, causing out of bounds access to the free_list array. That confirmed that the race exist, although it may be extremely rare, and currently only fatal where page isolation is performed due to memory hot remove. Races on pageblocks being updated by set_pageblock_migratetype(), where both old and new migratetype are lower MIGRATE_RESERVE, currently cannot result in an invalid value being observed, although theoretically they may still lead to unexpected creation or destruction of MIGRATE_RESERVE pageblocks. Furthermore, things could get suddenly worse when memory isolation is used more, or when new migratetypes are added. After this patch, the race has no longer been observed in testing. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reported-by: Joonsoo Kim Reported-and-tested-by: Vlastimil Babka Cc: Johannes Weiner Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 485932c577e7..6e937809c87a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6028,53 +6028,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pageblock_flags_mask(struct page *page, unsigned long flags, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* -- cgit v1.2.3 From dc4b0caff24d9b2918e9f27bc65499ee63187eba Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:17 -0700 Subject: mm: page_alloc: reduce number of times page_to_pfn is called In the free path we calculate page_to_pfn multiple times. Reduce that. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e937809c87a..6cadc8678e28 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -560,6 +560,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -576,7 +577,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -711,7 +712,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -723,13 +724,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + int order, int migratetype) { spin_lock(&zone->lock); zone->pages_scanned = 0; - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -766,15 +769,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -1380,12 +1384,13 @@ void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1399,7 +1404,7 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; @@ -6028,17 +6033,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 
unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; @@ -6050,25 +6054,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, } /** - * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_mask(struct page *page, unsigned long flags, +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; -- cgit v1.2.3 From cfc47a2803db42140167b92d991ef04018e162c7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:19 -0700 Subject: mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free get_pageblock_migratetype() is called during free with IRQs disabled. This is unnecessary and disables IRQs for longer than necessary. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cadc8678e28..ce4d3716214c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -774,9 +774,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); -- cgit v1.2.3 From 7aeb09f9104b760fc53c98cb7d20d06640baf9e6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:21 -0700 Subject: mm: page_alloc: use unsigned int for order in more places X86 prefers the use of unsigned types for iterators and there is a tendency to mix whether a signed or unsigned type if used for page order. This converts a number of sites in mm/page_alloc.c to use unsigned int for order where possible. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce4d3716214c..37ef1b87f1f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -409,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -453,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -504,7 +505,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). */ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; @@ -726,7 +727,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, - int order, + unsigned int order, int migratetype) { spin_lock(&zone->lock); @@ -897,7 +898,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -1108,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1345,7 +1347,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1541,8 +1543,8 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; @@ -1691,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. 
*/ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; @@ -1726,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -4121,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -6444,7 +6447,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6496,7 +6499,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; - int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { -- cgit v1.2.3 From b745bc85f21ea707e4ea1a91948055fa3e72c77b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:22 -0700 Subject: mm: page_alloc: convert hot/cold parameter and immediate callers to bool cold is a bool, make it one. Make the likely case the "if" part of the block instead of the else as according to the optimisation manual this is preferred. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++---------- mm/swap.c | 4 ++-- mm/swap_state.c | 2 +- mm/vmscan.c | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ef1b87f1f3..09345ab7fb63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1199,7 +1199,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { int i; @@ -1218,7 +1218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); @@ -1379,9 +1379,9 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? 
free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1413,10 +1413,10 @@ void free_hot_cold_page(struct page *page, int cold) } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1431,7 +1431,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1548,7 +1548,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -2823,7 +2823,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } diff --git a/mm/swap.c b/mm/swap.c index c8d6df556ce6..11ebb9714f49 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -860,7 +860,7 @@ void lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. 
*/ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace30d436..2972eee184a4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9253e188000f..494cd632178c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1121,7 +1121,7 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1532,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1755,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP -- cgit v1.2.3 From 07a427884348d38a6fd56fa4d78249c407196650 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:24 -0700 Subject: mm: shmem: avoid atomic operation during shmem_getpage_gfp shmem_getpage_gfp uses an atomic operation to set the SwapBacked field before it's even added to the LRU or visible. This is unnecessary as what could it possible race against? Use an unlocked variant. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9f70e02111c6..f47fb38c4889 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1132,7 +1132,7 @@ repeat: goto decused; } - SetPageSwapBacked(page); + __SetPageSwapBacked(page); __set_page_locked(page); error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); -- cgit v1.2.3 From e3741b506c5088fa8c911bb5884c430f770fb49d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:26 -0700 Subject: mm: do not use atomic operations when releasing pages There should be no references to it any more and a parallel mark should not be reordered against us. Use non-locked varient to clear page active. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 11ebb9714f49..30b6a37c74af 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -901,7 +901,7 @@ void release_pages(struct page **pages, int nr, bool cold) } /* Clear Active bit in case of parallel mark_page_accessed */ - ClearPageActive(page); + __ClearPageActive(page); list_add(&page->lru, &pages_to_free); } -- cgit v1.2.3 From 6fb81a17d21f2a138b8f424af4cf379f2b694060 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:28 -0700 Subject: mm: do not use unnecessary atomic operations when adding pages to the LRU When adding pages to the LRU we clear the active bit unconditionally. As the page could be reachable from other paths we cannot use unlocked operations without risk of corruption such as a parallel mark_page_accessed. This patch tests if is necessary to clear the active flag before using an atomic operation. This potentially opens a tiny race when PageActive is checked as mark_page_accessed could be called after PageActive was checked. The race already exists but this patch changes it slightly. The consequence is that that the page may be promoted to the active list that might have been left on the inactive list before the patch. It's too tiny a race and too marginal a consequence to always use atomic operations for. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 30b6a37c74af..1fb25f8bb155 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -631,13 +631,15 @@ static void __lru_cache_add(struct page *page) */ void lru_cache_add_anon(struct page *page) { - ClearPageActive(page); + if (PageActive(page)) + ClearPageActive(page); __lru_cache_add(page); } void lru_cache_add_file(struct page *page) { - ClearPageActive(page); + if (PageActive(page)) + ClearPageActive(page); __lru_cache_add(page); } EXPORT_SYMBOL(lru_cache_add_file); -- cgit v1.2.3 From 2457aec63745e235bcafb7ef312b182d8682f0fc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:31 -0700 Subject: mm: non-atomically mark page accessed during page cache allocation where possible aops->write_begin may allocate a new page and make it visible only to have mark_page_accessed called almost immediately after. Once the page is visible the atomic operations are necessary which is noticable overhead when writing to an in-memory filesystem like tmpfs but should also be noticable with fast storage. The objective of the patch is to initialse the accessed information with non-atomic operations before the page is visible. The bulk of filesystems directly or indirectly use grab_cache_page_write_begin or find_or_create_page for the initial allocation of a page cache page. This patch adds an init_page_accessed() helper which behaves like the first call to mark_page_accessed() but may called before the page is visible and can be done non-atomically. The primary APIs of concern in this care are the following and are used by most filesystems. 
find_get_page find_lock_page find_or_create_page grab_cache_page_nowait grab_cache_page_write_begin All of them are very similar in detail, so the patch creates a core helper pagecache_get_page() which takes a flags parameter that affects its behavior, such as whether the page should be marked accessed or not. The old API is preserved but is basically a thin wrapper around this core function (a sketch of such a wrapper follows after the async results below). Each of the filesystems is then updated to avoid calling mark_page_accessed when it is known that the VM interfaces have already done the job. There is a slight snag in that the timing of mark_page_accessed() has now changed, so in rare cases it's possible a page gets to the end of the LRU as PageReferenced whereas previously it might have been repromoted. This is expected to be rare but it's worth the filesystem people thinking about it in case they see a problem with the timing change. It is also the case that some filesystems may now mark pages accessed that they previously did not, but it makes sense that filesystems have consistent behaviour in this regard. The test case used to evaluate this is a simple dd of a large file done multiple times with the file deleted on each iteration. The size of the file is 1/10th of physical memory to avoid dirty page balancing. In the async case it is possible that the workload completes without even hitting the disk and will have variable results, but it highlights the impact of mark_page_accessed for async IO. The sync results are expected to be more stable. The exception is tmpfs where the normal case is for the "IO" to not hit the disk. The test machine was single socket and UMA to avoid any scheduling or NUMA artifacts. Throughput and wall times are presented for sync IO; only wall times are shown for async, as the granularity reported by dd and the variability make it unsuitable for comparison. As async results were variable due to writeback timings, I'm only reporting the maximum figures. The sync results were stable enough to make the mean and stddev uninteresting. The performance results are reported based on a run with no profiling. Profile data is based on a separate run with oprofile running. async dd 3.15.0-rc3 3.15.0-rc3 vanilla accessed-v2 ext3 Max elapsed 13.9900 ( 0.00%) 11.5900 ( 17.16%) tmpfs Max elapsed 0.5100 ( 0.00%) 0.4900 ( 3.92%) btrfs Max elapsed 12.8100 ( 0.00%) 12.7800 ( 0.23%) ext4 Max elapsed 18.6000 ( 0.00%) 13.3400 ( 28.28%) xfs Max elapsed 12.5600 ( 0.00%) 2.0900 ( 83.36%) The XFS figure is a bit strange as it managed to avoid a worst case by sheer luck, but the average figures looked reasonable.
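The preserved entry points referred to above live outside mm/, so they do not appear in the diff below; a rough sketch of what such a thin wrapper looks like (illustrative only, the real definitions are in include/linux/pagemap.h and may differ in detail):

/*
 * Rough sketch of the "thin wrapper" idea, not the actual header code.
 * With no FGP_CREAT the gfp arguments are unused, so 0 is passed here.
 */
static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, 0, 0, 0);
}

static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
}

find_or_create_page() additionally sets FGP_CREAT alongside the locking/accessed flags and passes the caller's gfp mask, so the core helper can allocate and insert a missing page.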
samples percentage ext3 86107 0.9783 vmlinux-3.15.0-rc4-vanilla mark_page_accessed ext3 23833 0.2710 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed ext3 5036 0.0573 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed ext4 64566 0.8961 vmlinux-3.15.0-rc4-vanilla mark_page_accessed ext4 5322 0.0713 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed ext4 2869 0.0384 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed xfs 62126 1.7675 vmlinux-3.15.0-rc4-vanilla mark_page_accessed xfs 1904 0.0554 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed xfs 103 0.0030 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed btrfs 10655 0.1338 vmlinux-3.15.0-rc4-vanilla mark_page_accessed btrfs 2020 0.0273 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed btrfs 587 0.0079 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed tmpfs 59562 3.2628 vmlinux-3.15.0-rc4-vanilla mark_page_accessed tmpfs 1210 0.0696 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed tmpfs 94 0.0054 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed [akpm@linux-foundation.org: don't run init_page_accessed() against an uninitialised pointer] Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Tested-by: Prabhakar Lad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 202 ++++++++++++++++++++++------------------------------------- mm/shmem.c | 6 +- mm/swap.c | 11 ++++ 3 files changed, 91 insertions(+), 128 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 47d235b357a7..0fcd792103f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -981,26 +981,6 @@ out: } EXPORT_SYMBOL(find_get_entry); -/** - * find_get_page - find and get a page reference - * @mapping: the address_space to search - * @offset: the page index - * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. - * - * Otherwise, %NULL is returned. - */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) -{ - struct page *page = find_get_entry(mapping, offset); - - if (radix_tree_exceptional_entry(page)) - page = NULL; - return page; -} -EXPORT_SYMBOL(find_get_page); - /** * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search @@ -1038,66 +1018,84 @@ repeat: EXPORT_SYMBOL(find_lock_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index + * @fgp_flags: PCG flags + * @gfp_mask: gfp mask to use if a page is to be allocated * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. - * - * Otherwise, %NULL is returned. - * - * find_lock_page() may sleep. - */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) -{ - struct page *page = find_lock_entry(mapping, offset); - - if (radix_tree_exceptional_entry(page)) - page = NULL; - return page; -} -EXPORT_SYMBOL(find_lock_page); - -/** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * Looks up the page cache slot at @mapping & @offset. 
* - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * PCG flags modify how the page is returned * - * If the page is not present, a new page is allocated using @gfp_mask - * and added to the page cache and the VM's LRU list. The page is - * returned locked and with an increased refcount. + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, %NULL is returned. * - * On memory exhaustion, %NULL is returned. + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an - * atomic allocation! + * If there is a page cache page, it is returned with an increased refcount. */ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - /* - * We want a regular kernel memory (not highmem or DMA etc) - * allocation for the radix tree nodes, but we need to honour - * the context-specific requirements the caller has asked for. - * GFP_RECLAIM_MASK collects those requirements. - */ - err = add_to_page_cache_lru(page, mapping, index, - (gfp_mask & GFP_RECLAIM_MASK)); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoit atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + init_page_accessed(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -1105,9 +1103,10 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); /** * find_get_entries - gang pagecache lookup @@ -1404,39 +1403,6 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. 
- * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (trylock_page(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -2406,7 +2372,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); @@ -2488,34 +2453,18 @@ EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { - int status; - gfp_t gfp_mask; struct page *page; - gfp_t gfp_notmask = 0; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - gfp_mask = mapping_gfp_mask(mapping); - if (mapping_cap_account_dirty(mapping)) - gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) - gfp_notmask = __GFP_FS; -repeat: - page = find_lock_page(mapping, index); + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); if (page) - goto found; + wait_for_stable_page(page); - page = __page_cache_alloc(gfp_mask & ~gfp_notmask); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & ~gfp_notmask); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } -found: - wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2564,7 +2513,7 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) @@ -2573,7 +2522,6 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); flush_dcache_page(page); - mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) diff --git a/mm/shmem.c b/mm/shmem.c index f47fb38c4889..5402481c28d1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1372,9 +1372,13 @@ shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + if (ret == 0 && *pagep) + init_page_accessed(*pagep); + return ret; } static int diff --git a/mm/swap.c b/mm/swap.c index 1fb25f8bb155..9e8e3472248b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -614,6 +614,17 @@ void mark_page_accessed(struct page *page) } 
EXPORT_SYMBOL(mark_page_accessed); +/* + * Used to mark_page_accessed(page) that is not visible yet and when it is + * still safe to use non-atomic ops + */ +void init_page_accessed(struct page *page) +{ + if (!PageReferenced(page)) + __SetPageReferenced(page); +} +EXPORT_SYMBOL(init_page_accessed); + static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); -- cgit v1.2.3 From d8846374a85f4290a473a4e2a64c1ba046c4a0e1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:33 -0700 Subject: mm: page_alloc: calculate classzone_idx once from the zonelist ref There is no need to calculate zone_idx(preferred_zone) multiple times or use the pgdat to figure it out. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09345ab7fb63..8f785b1534a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1916,11 +1916,10 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ @@ -1928,7 +1927,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. 
@@ -2186,7 +2184,7 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; @@ -2204,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2239,7 +2237,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2267,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2299,7 +2297,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, bool *contended_compaction, + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2339,7 +2338,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2357,7 +2356,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2380,14 +2380,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2488,7 +2488,7 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2537,15 
+2537,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2560,7 +2563,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2591,7 +2594,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2621,7 +2625,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2640,7 +2645,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2681,7 +2686,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2708,10 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2734,11 +2742,12 @@ retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? 
: &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -2748,7 +2757,7 @@ retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* * The first pass makes sure allocations are spread @@ -2774,7 +2783,7 @@ retry: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); -- cgit v1.2.3 From 888cf2db475a256fb0cda042140f73d7881f81fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:34 -0700 Subject: mm: avoid unnecessary atomic operations during end_page_writeback() If a page is marked for immediate reclaim then it is moved to the tail of the LRU list. This occurs when the system is under enough memory pressure for pages under writeback to reach the end of the LRU but we test for this using atomic operations on every writeback. This patch uses an optimistic non-atomic test first. It'll miss some pages in rare cases but the consequences are not severe enough to warrant such a penalty. While the function does not dominate profiles during a simple dd test the cost of it is reduced. 73048 0.7428 vmlinux-3.15.0-rc5-mmotm-20140513 end_page_writeback 23740 0.2409 vmlinux-3.15.0-rc5-lessatomic end_page_writeback Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 0fcd792103f3..7fadf1c62838 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -753,8 +753,17 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (TestClearPageReclaim(page)) + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); rotate_reclaimable_page(page); + } if (!test_clear_page_writeback(page)) BUG(); -- cgit v1.2.3 From 6edd6cc66201e06a6cc34030462217e7f4d72f4f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:10:35 -0700 Subject: mm/memory-failure.c: move comment The comment about pages under writeback is far from the relevant code, so let's move it to the right place. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3cd1b652821c..a0474680c394 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1132,11 +1132,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } } - /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. 
- */ lock_page(hpage); /* @@ -1186,6 +1181,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p)) set_page_hwpoison_huge_page(hpage); + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ wait_on_page_writeback(p); /* -- cgit v1.2.3 From 8f34af6f93aee88291cec53ae8dff4989e58fbbd Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:36 -0700 Subject: mm, hugetlb: move the error handle logic out of normal code path alloc_huge_page() currently mixes the normal code path with error handling logic. This patch moves the error handling logic out, to make the normal code path cleaner and reduce code duplication. Signed-off-by: Jianyu Zhan Acked-by: Davidlohr Bueso Reviewed-by: Michal Hocko Reviewed-by: Aneesh Kumar K.V Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98f0bc105dfe..244194217e39 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1386,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) { - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (ret) + goto out_subpool_put; + spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) { - hugetlb_cgroup_uncharge_cgroup(idx, - pages_per_huge_page(h), - h_cg); - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (!page) + goto out_uncharge_cgroup; + spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1415,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, vma_commit_reservation(h, vma, addr); return page; + +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_subpool_put: + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); } /* -- cgit v1.2.3 From 4be89a34609659042ef0bf883ad76388fb5251bb Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:38 -0700 Subject: mm/vmscan.c: use DIV_ROUND_UP for calculation of zone's balance_gap and correct comments. Currently, we use (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO to avoid a zero gap value. It's better to use the DIV_ROUND_UP macro for neater code and clearer meaning. Besides, the gap value is calculated against the per-zone "managed pages", not "present pages". This patch also corrects the comment and does some rephrasing.
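For reference, DIV_ROUND_UP(n, d) in include/linux/kernel.h expands to ((n) + (d) - 1) / (d), so the conversion is purely cosmetic; a standalone sketch with made-up numbers (the ratio is 100 in this kernel, but treat the values as illustrative):

/*
 * Standalone sketch, not kernel code: the macro form and the open-coded
 * form it replaces compute the same rounded-up quotient.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* i.e. a gap of 1% of the zone */

int main(void)
{
	unsigned long managed_pages = 262144;	/* a 1GB zone with 4K pages */

	unsigned long open_coded = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
				   KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long with_macro = DIV_ROUND_UP(managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO);

	printf("%lu %lu\n", open_coded, with_macro);	/* both print 2622 */
	return 0;
}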
Signed-off-by: Jianyu Zhan Acked-by: Rik van Riel Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 494cd632178c..cc29fca8d989 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2295,9 +2295,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2949,9 +2948,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * high wmark plus a "gap" where the gap is either the low * watermark or 1% of the zone, whichever is smaller. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); /* * If there is no low memory pressure or the zone is balanced then no -- cgit v1.2.3 From be9765722e6b7ece8263cbab857490332339bd6f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:10:41 -0700 Subject: mm, compaction: properly signal and act upon lock and need_sched() contention Compaction uses compact_checklock_irqsave() function to periodically check for lock contention and need_resched() to either abort async compaction, or to free the lock, schedule and retake the lock. When aborting, cc->contended is set to signal the contended state to the caller. Two problems have been identified in this mechanism. First, compaction also calls directly cond_resched() in both scanners when no lock is yet taken. This call either does not abort async compaction, or set cc->contended appropriately. This patch introduces a new compact_should_abort() function to achieve both. In isolate_freepages(), the check frequency is reduced to once by SWAP_CLUSTER_MAX pageblocks to match what the migration scanner does in the preliminary page checks. In case a pageblock is found suitable for calling isolate_freepages_block(), the checks within there are done on higher frequency. Second, isolate_freepages() does not check if isolate_freepages_block() aborted due to contention, and advances to the next pageblock. This violates the principle of aborting on contention, and might result in pageblocks not being scanned completely, since the scanning cursor is advanced. This problem has been noticed in the code by Joonsoo Kim when reviewing related patches. This patch makes isolate_freepages_block() check the cc->contended flag and abort. In case isolate_freepages() has already isolated some pages before aborting due to contention, page migration will proceed, which is OK since we do not want to waste the work that has been done, and page migration has own checks for contention. However, we do not want another isolation attempt by either of the scanners, so cc->contended flag check is added also to compaction_alloc() and compact_finished() to make sure compaction is aborted right after the migration. 
The outcome of the patch should be reduced lock contention by async compaction and lower latencies for higher-order allocations where direct compaction is involved. [akpm@linux-foundation.org: fix typo in comment] Reported-by: Joonsoo Kim Signed-off-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Minchan Kim Cc: Mel Gorman Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Acked-by: Michal Nazarewicz Tested-by: Shawn Guo Tested-by: Kevin Hilman Tested-by: Stephen Warren Tested-by: Fabio Estevam Cc: David Rientjes Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 54 ++++++++++++++++++++++++++++++++++++++++++++---------- mm/internal.h | 5 ++++- 2 files changed, 48 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 58441220b953..21bf292b642a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -222,6 +222,30 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) +{ + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { @@ -494,11 +518,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } - if (cond_resched()) { - /* Async terminates prematurely on need_resched() */ - if (cc->mode == MIGRATE_ASYNC) - return 0; - } + if (compact_should_abort(cc)) + return 0; /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { @@ -720,9 +741,11 @@ static void isolate_freepages(struct zone *zone, /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; if (!pfn_valid(block_start_pfn)) continue; @@ -760,6 +783,13 @@ static void isolate_freepages(struct zone *zone, */ if (isolated) cc->finished_update_free = true; + + /* + * isolate_freepages_block() might have aborted due to async + * compaction being contended + */ + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -786,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. 
+ */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -858,7 +892,7 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ diff --git a/mm/internal.h b/mm/internal.h index 802c3a4fc03a..7f22a11fcc66 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -144,7 +144,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long -- cgit v1.2.3 From 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:10:44 -0700 Subject: mm/msync.c: sync only the requested range in msync() msync() currently syncs more than POSIX requires or BSD or Solaris implement. It is supposed to be equivalent to fdatasync(), not fsync(), and it is only supposed to sync the portion of the file that overlaps the range passed to msync. If the VMA is non-linear, fall back to syncing the entire file, but we still optimise to only fdatasync() the entire file, not the full fsync(). akpm: there are obvious concerns with bck-compatibility: is anyone relying on the undocumented side-effect for their data integrity? And how would they ever know if this change broke their data integrity? We think the risk is reasonably low, and this patch brings the kernel into line with other OS's and with what the manpage has always said... Signed-off-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Acked-by: Jeff Moyer Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index 632df4527c01..a5c673669ca6 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) vma = find_vma(mm, start); for (;;) { struct file *file; + loff_t fstart, fend; /* Still start < end. */ error = -ENOMEM; @@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; } file = vma->vm_file; + fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fend = fstart + (min(end, vma->vm_end) - start) - 1; start = vma->vm_end; if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, 0); + if (vma->vm_flags & VM_NONLINEAR) + error = vfs_fsync(file, 1); + else + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3 From 850e9c69ca75f32aa9361a0edec6cad388a231b0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:45 -0700 Subject: mm: fix typo in comment in do_fault_around() Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7049d394fa07..e7ccbac25b72 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2832,7 +2832,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, /* * max_pgoff is either end of page table or end of vma - * or fault_around_pages() from pgoff, depending what is neast. + * or fault_around_pages() from pgoff, depending what is nearest. */ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; -- cgit v1.2.3 From 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:49 -0700 Subject: mm: vmscan: use proportional scanning during direct reclaim and full scan at DEF_PRIORITY Commit "mm: vmscan: obey proportional scanning requirements for kswapd" ensured that file/anon lists were scanned proportionally for reclaim from kswapd but ignored it for direct reclaim. The intent was to minimse direct reclaim latency but Yuanhan Liu pointer out that it substitutes one long stall for many small stalls and distorts aging for normal workloads like streaming readers/writers. Hugh Dickins pointed out that a side-effect of the same commit was that when one LRU list dropped to zero that the entirety of the other list was shrunk leading to excessive reclaim in memcgs. This patch scans the file/anon lists proportionally for direct reclaim to similarly age page whether reclaimed by kswapd or direct reclaim but takes care to abort reclaim if one LRU drops to zero after reclaiming the requested number of pages. Based on ext4 and using the Intel VM scalability test 3.15.0-rc5 3.15.0-rc5 shrinker proportion Unit lru-file-readonce elapsed 5.3500 ( 0.00%) 5.4200 ( -1.31%) Unit lru-file-readonce time_range 0.2700 ( 0.00%) 0.1400 ( 48.15%) Unit lru-file-readonce time_stddv 0.1148 ( 0.00%) 0.0536 ( 53.33%) Unit lru-file-readtwice elapsed 8.1700 ( 0.00%) 8.1700 ( 0.00%) Unit lru-file-readtwice time_range 0.4300 ( 0.00%) 0.2300 ( 46.51%) Unit lru-file-readtwice time_stddv 0.1650 ( 0.00%) 0.0971 ( 41.16%) The test cases are running multiple dd instances reading sparse files. The results are within the noise for the small test machine. The impact of the patch is more noticable from the vmstats 3.15.0-rc5 3.15.0-rc5 shrinker proportion Minor Faults 35154 36784 Major Faults 611 1305 Swap Ins 394 1651 Swap Outs 4394 5891 Allocation stalls 118616 44781 Direct pages scanned 4935171 4602313 Kswapd pages scanned 15921292 16258483 Kswapd pages reclaimed 15913301 16248305 Direct pages reclaimed 4933368 4601133 Kswapd efficiency 99% 99% Kswapd velocity 670088.047 682555.961 Direct efficiency 99% 99% Direct velocity 207709.217 193212.133 Percentage direct scans 23% 22% Page writes by reclaim 4858.000 6232.000 Page writes file 464 341 Page writes anon 4394 5891 Note that there are fewer allocation stalls even though the amount of direct reclaim scanning is very approximately the same. 
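
To make the proportional cut-off more concrete, the sketch below mirrors the shape of the adjustment with invented numbers: once the requested number of pages has been reclaimed, the smaller LRU stops being scanned and the larger LRU's remaining scan count is recalculated from its original target and the fraction of the smaller target that was still unscanned. This is a simplified user-space illustration only, not the kernel code:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* invented scan targets from get_scan_count() */
	unsigned long anon_target = 100, file_target = 400;
	/* invented leftovers at the moment nr_to_reclaim was met */
	unsigned long anon_left = 40, file_left = 360;

	/* anon is the smaller list here: stop it, note its unscanned share */
	unsigned long percentage = anon_left * 100 / (anon_target + 1);

	/* rescale the file list against its original target */
	unsigned long file_scanned = file_target - file_left;
	unsigned long file_new = file_target * percentage / 100;

	file_new -= min_ul(file_new, file_scanned);

	printf("anon: stop scanning, file: %lu pages left to scan\n", file_new);
	return 0;
}
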
Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Cc: Tim Chen Cc: Dave Chinner Tested-by: Yuanhan Liu Cc: Bob Liu Cc: Jan Kara Cc: Rik van Riel Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cc29fca8d989..9149444f947d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2064,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2090,18 +2104,9 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (nr_reclaimed < nr_to_reclaim || scan_adjusted) continue; - /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. - */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2109,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; -- cgit v1.2.3 From 226b4ccdcb6371645c25ec99b59bfde65987318c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:50 -0700 Subject: mm/process_vm_access: move config option into init/Kconfig CONFIG_CROSS_MEMORY_ATTACH adds couple syscalls: process_vm_readv and process_vm_writev, it's a kind of IPC for copying data between processes. Currently this option is placed inside "Processor type and features". 
This patch moves it into "General setup" (where all other arch-independed syscalls and ipc features are placed) and changes prompt string to less cryptic. Signed-off-by: Konstantin Khlebnikov Cc: Christopher Yeoh Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 75ac479cbacd..0f00bffaaf61 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -436,16 +436,6 @@ choice benefit. endchoice -config CROSS_MEMORY_ATTACH - bool "Cross Memory Support" - depends on MMU - default y - help - Enabling this option adds the system calls process_vm_readv and - process_vm_writev which allow a process with the correct privileges - to directly read from or write to to another process's address space. - See the man page for more details. - # # UP and nommu archs use km based percpu allocator # -- cgit v1.2.3 From 3d92860f979f725a9c10c2fc26c0415a4332adbf Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:51 -0700 Subject: mm/rmap.c: don't call mmu_notifier_invalidate_page() during munlock In its munmap mode, try_to_unmap_one() searches other mlocked vmas, it never unmaps pages. There is no reason for invalidation because ptes are left unchanged. Signed-off-by: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index e375ce4bd93e..ab74290d185d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1252,7 +1252,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL) + if (ret != SWAP_FAIL && TTU_ACTION(flags) != TTU_MUNLOCK) mmu_notifier_invalidate_page(mm, address); out: return ret; -- cgit v1.2.3 From daa5ba768b9e15da8867824d2f1e8d455f1acac2 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:52 -0700 Subject: mm/rmap.c: cleanup ttu_flags Transform action part of ttu_flags into individiual bits. These flags aren't part of any uses-space visible api or even trace events. Signed-off-by: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index ab74290d185d..ea8e20d75b29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1162,7 +1162,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) goto out_mlock; - if (TTU_ACTION(flags) == TTU_MUNLOCK) + if (flags & TTU_MUNLOCK) goto out_unmap; } if (!(flags & TTU_IGNORE_ACCESS)) { @@ -1230,7 +1230,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. 
*/ - BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); + BUG_ON(!(flags & TTU_MIGRATION)); entry = make_migration_entry(page, pte_write(pteval)); } swp_pte = swp_entry_to_pte(entry); @@ -1239,7 +1239,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (TTU_ACTION(flags) == TTU_MIGRATION)) { + (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1252,7 +1252,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && TTU_ACTION(flags) != TTU_MUNLOCK) + if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1539,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; ret = rmap_walk(page, &rwc); -- cgit v1.2.3 From 7d018176e6d50510b142bccbd60d8c6ed5e72e56 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 4 Jun 2014 16:10:53 -0700 Subject: mm/page_alloc.c: cleanup add_active_range() related comments add_active_range() has been repalced by memblock_set_node(). Clean up the comments to comply with that change. Signed-off-by: Zhang Zhen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f785b1534a3..a59bdb653958 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4387,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4444,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4469,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 
* - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. + * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4489,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -5066,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5287,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed -- cgit v1.2.3 From a9b0f8618d46ba027243b8ecb5c2468a7112d235 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:54 -0700 Subject: mm: nominate faultaround area in bytes rather than page order There is evidencs that the faultaround feature is less relevant on architectures with page size bigger then 4k. Which makes sense since page fault overhead per byte of mapped area should be less there. Let's rework the feature to specify faultaround area in bytes instead of page order. It's 64 kilobytes for now. The patch effectively disables faultaround on architectures with page size >= 64k (like ppc64). It's possible that some other size of faultaround area is relevant for a platform. We can expose `fault_around_bytes' variable to arch-specific code once such platforms will be found. Signed-off-by: Kirill A. 
Shutemov Cc: Rusty Russell Cc: Hugh Dickins Cc: Madhavan Srinivasan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 62 +++++++++++++++++++++++-------------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e7ccbac25b72..62a08a7badc4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2758,63 +2758,47 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -#define FAULT_AROUND_ORDER 4 +static unsigned long fault_around_bytes = 65536; + +static inline unsigned long fault_around_pages(void) +{ + return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; +} -#ifdef CONFIG_DEBUG_FS -static unsigned int fault_around_order = FAULT_AROUND_ORDER; -static int fault_around_order_get(void *data, u64 *val) +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) { - *val = fault_around_order; + *val = fault_around_bytes; return 0; } -static int fault_around_order_set(void *data, u64 val) +static int fault_around_bytes_set(void *data, u64 val) { - BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); - if (1UL << val > PTRS_PER_PTE) + if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; - fault_around_order = val; + fault_around_bytes = val; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, - fault_around_order_get, fault_around_order_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, - &fault_around_order_fops); + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); if (!ret) - pr_warn("Failed to create fault_around_order in debugfs"); + pr_warn("Failed to create fault_around_bytes in debugfs"); return 0; } late_initcall(fault_around_debugfs); - -static inline unsigned long fault_around_pages(void) -{ - return 1UL << fault_around_order; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); -} -#else -static inline unsigned long fault_around_pages(void) -{ - unsigned long nr_pages; - - nr_pages = 1UL << FAULT_AROUND_ORDER; - BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); - return nr_pages; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); -} #endif static void do_fault_around(struct vm_area_struct *vma, unsigned long address, @@ -2871,7 +2855,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages) { + if (vma->vm_ops->map_pages && fault_around_pages() > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) -- cgit v1.2.3 From 1fdb412bd825998efbced3a16f6ce7e0329728cf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:55 -0700 Subject: mm: document do_fault_around() feature Some clarification on how faultaround works. 
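
The conversion above replaces the order-based knob with a byte count, where fault_around_pages() becomes rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE and do_read_fault() only bothers with fault-around when that value is greater than one. A user-space sketch of the arithmetic (the rounding helper is re-implemented locally for illustration) showing why a 64KB default effectively disables the feature on 64KB-page architectures:

#include <stdio.h>

/* local stand-in for the kernel's rounddown_pow_of_two() */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long fault_around_bytes = 65536;	/* the new default */
	unsigned long page_sizes[] = { 4096, 16384, 65536 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long pages = rounddown_pow_of_two(fault_around_bytes) /
				      page_sizes[i];

		printf("PAGE_SIZE=%lu -> fault_around_pages()=%lu\n",
		       page_sizes[i], pages);	/* 16, 4 and 1 respectively */
	}
	return 0;
}
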
[akpm@linux-foundation.org: tweak comment text] Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 62a08a7badc4..d67fd9fcf1f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2760,6 +2760,10 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, static unsigned long fault_around_bytes = 65536; +/* + * fault_around_pages() and fault_around_mask() round down fault_around_bytes + * to nearest page order. It's what do_fault_around() expects to see. + */ static inline unsigned long fault_around_pages(void) { return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; @@ -2801,6 +2805,29 @@ static int __init fault_around_debugfs(void) late_initcall(fault_around_debugfs); #endif +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. + */ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { -- cgit v1.2.3 From 100873d7a777b67ad35197c5a998b5e778f8bf3f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:10:56 -0700 Subject: hugetlb: rename hugepage_migration_support() to ..._supported() We already have a function named hugepages_supported(), and the similar name hugepage_migration_support() is a bit unconfortable, so let's rename it hugepage_migration_supported(). Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- mm/migrate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 244194217e39..226910cb7c9b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_support(h)) + if (hugepages_treat_as_movable || hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; diff --git a/mm/migrate.c b/mm/migrate.c index 2a459675eeab..63f0cd559999 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1039,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. 
*/ - if (!hugepage_migration_support(page_hstate(hpage))) { + if (!hugepage_migration_supported(page_hstate(hpage))) { putback_active_hugepage(hpage); return -ENOSYS; } -- cgit v1.2.3 From 50088c440910730baf3248acfad2c846fb3eea77 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 4 Jun 2014 16:10:57 -0700 Subject: mm/swapfile.c: delete the "last_in_cluster < scan_base" loop in the body of scan_swap_map() Via commit ebc2a1a69111 ("swap: make cluster allocation per-cpu"), we can find that all SWP_SOLIDSTATE "seek is cheap"(SSD case) has already gone to si->cluster_info scan_swap_map_try_ssd_cluster() route. So that the "last_in_cluster < scan_base" loop in the body of scan_swap_map() has already become a dead code snippet, and it should have been deleted. This patch is to delete the redundant loop as Hugh and Shaohua suggested. [hughd@google.com: fix comment, simplify code] Signed-off-by: Chen Yucong Cc: Shaohua Li Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index beeeef8a1b2d..4c524f7bd0bf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -523,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. - * But if seek is cheap, search from our current position, so - * that swap is allocated from all over the partition: if the - * Flash Translation Layer only remaps within limited zones, - * we don't want to wear out the first zone too quickly. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. */ - if (!(si->flags & SWP_SOLIDSTATE)) - scan_base = offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -549,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } } - offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster < scan_base; offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; -- cgit v1.2.3 From d2f3102838d90ed6ed09a6154bdb2306f7cf1548 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:58 -0700 Subject: mm/page-writeback.c: remove outdated comment There is an orphaned prehistoric comment , which used to be against get_dirty_limits(), the dawn of global_dirtyable_memory(). Back then, the implementation of get_dirty_limits() is complicated and full of magic numbers, so this comment is necessary. But we now use the clear and neat global_dirtyable_memory(), which renders this comment ambiguous and useless. Remove it. 
Signed-off-by: Jianyu Zhan Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b9b8e8204628..533fa60c9ac1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -155,24 +155,6 @@ static unsigned long writeout_period_time = 0; */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of -- cgit v1.2.3 From a70ffcac741d31a406c1d2b832ae43d658e7e1cf Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 4 Jun 2014 16:10:59 -0700 Subject: mm/memory-failure.c-failure: send right signal code to correct thread When a thread in a multi-threaded application hits a machine check because of an uncorrectable error in memory - we want to send the SIGBUS with si.si_code = BUS_MCEERR_AR to that thread. Currently we fail to do that if the active thread is not the primary thread in the process. collect_procs() just finds primary threads and this test: if ((flags & MF_ACTION_REQUIRED) && t == current) { will see that the thread we found isn't the current thread and so send a si.si_code = BUS_MCEERR_AO to the primary (and nothing to the active thread at this time). We can fix this by checking whether "current" shares the same mm with the process that collect_procs() said owned the page. If so, we send the SIGBUS to current (with code BUS_MCEERR_AR). 
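
For context on the user-visible side of this fix: a recovery-aware application installs a SIGBUS handler and inspects si_code to distinguish the "action required" and "action optional" cases, and may opt in to early delivery of the optional ones with prctl(). The sketch below illustrates that existing ABI and is not part of the patch; with the dedicated-thread support added later in this series, the same prctl() call can be issued from a handler thread instead of the main thread:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4		/* error in the current execution path */
#define BUS_MCEERR_AO 5		/* error detected asynchronously */
#endif

/* Not async-signal-safe (fprintf); kept simple for illustration. */
static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "fatal memory error at %p\n", si->si_addr);
	else if (si->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "advisory memory error at %p\n", si->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	/* Request early SIGBUS for action-optional errors (sets PF_MCE_EARLY). */
	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);

	pause();
	return 0;
}
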
Signed-off-by: Tony Luck Signed-off-by: Naoya Horiguchi Reported-by: Otto Bruggeman Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a0474680c394..89ad452182bb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #endif si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal -- cgit v1.2.3 From 74614de17db6fb472370c426d4f934d8d616edf2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 4 Jun 2014 16:11:01 -0700 Subject: mm/memory-failure.c: don't let collect_procs() skip over processes for MF_ACTION_REQUIRED When Linux sees an "action optional" machine check (where h/w has reported an error that is not in the current execution path) we generally do not want to signal a process, since most processes do not have a SIGBUS handler - we'd just prematurely terminate the process for a problem that they might never actually see. task_early_kill() decides whether to consider a process - and it checks whether this specific process has been marked for early signals with "prctl", or if the system administrator has requested early signals for all processes using /proc/sys/vm/memory_failure_early_kill. But for MF_ACTION_REQUIRED case we must not defer. The error is in the execution path of the current thread so we must send the SIGBUS immediatley. Fix by passing a flag argument through collect_procs*() to task_early_kill() so it knows whether we can defer or must take action. Signed-off-by: Tony Luck Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 89ad452182bb..ed339c505d55 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -380,10 +380,12 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +static int task_early_kill(struct task_struct *tsk, int force_early) { if (!tsk->mm) return 0; + if (force_early) + return 1; if (tsk->flags & PF_MCE_PROCESS) return !!(tsk->flags & PF_MCE_EARLY); return sysctl_memory_failure_early_kill; @@ -393,7 +395,7 @@ static int task_early_kill(struct task_struct *tsk) * Collect processes when the error hit an anonymous page. 
*/ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -409,7 +411,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, for_each_process (tsk) { struct anon_vma_chain *vmac; - if (!task_early_kill(tsk)) + if (!task_early_kill(tsk, force_early)) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { @@ -428,7 +430,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -439,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - if (!task_early_kill(tsk)) + if (!task_early_kill(tsk, force_early)) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, @@ -465,7 +467,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -476,9 +479,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -963,7 +966,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) -- cgit v1.2.3 From 3ba08129e38437561df44c36b7ea9081185d5333 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:11:02 -0700 Subject: mm/memory-failure.c: support use of a dedicated thread to handle SIGBUS(BUS_MCEERR_AO) Currently memory error handler handles action optional errors in the deferred manner by default. And if a recovery aware application wants to handle it immediately, it can do it by setting PF_MCE_EARLY flag. However, such signal can be sent only to the main thread, so it's problematic if the application wants to have a dedicated thread to handler such signals. So this patch adds dedicated thread support to memory error handler. We have PF_MCE_EARLY flags for each thread separately, so with this patch AO signal is sent to the thread with PF_MCE_EARLY flag set, not the main thread. If you want to implement a dedicated thread, you call prctl() to set PF_MCE_EARLY on the thread. Memory error handler collects processes to be killed, so this patch lets it check PF_MCE_EARLY flag on each thread in the collecting routines. No behavioral change for all non-early kill cases. 
Tony said: : The old behavior was crazy - someone with a multithreaded process might : well expect that if they call prctl(PF_MCE_EARLY) in just one thread, then : that thread would see the SIGBUS with si_code = BUS_MCEERR_A0 - even if : that thread wasn't the main thread for the process. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Naoya Horiguchi Reviewed-by: Tony Luck Cc: Kamil Iskra Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 56 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ed339c505d55..cd8989c1027e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -380,15 +380,44 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk, int force_early) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) { + struct task_struct *t; if (!tsk->mm) - return 0; + return NULL; if (force_early) - return 1; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* @@ -410,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -440,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -453,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. 
*/ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); -- cgit v1.2.3 From 2a7a0e0fdc49a08740a69d51ef44ef09763072b0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Jun 2014 16:11:04 -0700 Subject: mm, memcg: periodically schedule when emptying page list mem_cgroup_force_empty_list() can iterate a large number of pages on an lru and mem_cgroup_move_parent() doesn't return an errno unless certain criteria, none of which indicate that the iteration may be taking too long, is met. We have encountered the following stack trace many times indicating "need_resched set for > 51000020 ns (51 ticks) without schedule", for example: scheduler_tick() mem_cgroup_move_account+0x4d/0x1d5 mem_cgroup_move_parent+0x8d/0x109 mem_cgroup_reparent_charges+0x149/0x2ba mem_cgroup_css_offline+0xeb/0x11b cgroup_offline_fn+0x68/0x16b process_one_work+0x129/0x350 If this iteration is taking too long, we still need to do cond_resched() even when an individual page is not busy. [rientjes@google.com: changelog] Signed-off-by: Hugh Dickins Signed-off-by: David Rientjes Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d176edb1d5e8..a500cb0594c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4675,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } -- cgit v1.2.3 From 50417c55562c03e6746b13aee650c2bbb048fea3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:07 -0700 Subject: mm/zbud.c: make size unsigned like unique callsite zbud_alloc is only called by zswap_frontswap_store with unsigned int len. Change function parameter + update >= 0 check. Signed-off-by: Fabian Frederick Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index 9451361e6aa7..01df13a7e2e1 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, enum buddy bud; struct page *page; - if (size <= 0 || gfp & __GFP_HIGHMEM) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; -- cgit v1.2.3 From 7eb52512a977854eca51d9b692c2f3be8a0e5eeb Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 4 Jun 2014 16:11:08 -0700 Subject: zsmalloc: fixup trivial zs size classes value in comments According to calculation, ZS_SIZE_CLASSES value is 255 on systems with 4K page size, not 254. The old value may forget count the ZS_MIN_ALLOC_SIZE in. This patch fixes this trivial issue in the comments. 
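
The arithmetic behind the corrected number, assuming the constants zsmalloc.c used at the time (a minimum object size of 32 bytes, a maximum of PAGE_SIZE, and a class step of PAGE_SIZE >> 8, i.e. 16 bytes on 4K-page systems):

#include <stdio.h>

/* Assumed values from that era's mm/zsmalloc.c; shown for illustration. */
#define PAGE_SIZE		4096UL
#define ZS_MIN_ALLOC_SIZE	32UL
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)	/* 16 */

int main(void)
{
	unsigned long classes = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) /
				ZS_SIZE_CLASS_DELTA + 1;

	printf("%lu size classes\n", classes);	/* prints 255, not 254 */
	return 0;
}
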
Signed-off-by: Weijie Yang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5ae5d85b629d..fe78189624cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -141,7 +141,7 @@ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes -- cgit v1.2.3 From 93ef6d6ca11382eff03812797da457bc176653a4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 4 Jun 2014 16:11:09 -0700 Subject: mm/vmalloc.c: export unmap_kernel_range() zsmalloc needs exported unmap_kernel_range for building as a module. See https://lkml.org/lkml/2013/1/18/487 I didn't send a patch to make unmap_kernel_range exportable at that time because zram was staging stuff and I thought VM function exporting for staging stuff makes no sense. Now zsmalloc was promoted. If we can't build zsmalloc as module, it means we can't build zram as module, either. Additionally, buddy map_vm_area is already exported so let's export unmap_kernel_range to help his buddy. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2ed75fb89fc1..f64632b67196 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } +EXPORT_SYMBOL_GPL(unmap_kernel_range); int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { -- cgit v1.2.3 From d867f203b974e9a670028fda909ef09044b221f6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 4 Jun 2014 16:11:10 -0700 Subject: mm/zsmalloc: make zsmalloc module-buildable Now, we can build zsmalloc as module because unmap_kernel_range was exported. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 0f00bffaaf61..3e9977a9d657 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -551,7 +551,7 @@ config MEM_SOFT_DIRTY See Documentation/vm/soft-dirty.txt for more details. config ZSMALLOC - bool "Memory allocator for compressed pages" + tristate "Memory allocator for compressed pages" depends on MMU default n help -- cgit v1.2.3 From 72d09633c9afa02bea317d65eb8b8a4ce7659a2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jun 2014 16:11:11 -0700 Subject: mm/zswap: NUMA aware allocation for zswap_dstmem zswap_dstmem is a percpu block of memory, which should be allocated using kmalloc_node(), to get better NUMA locality. Without it, all the blocks are allocated from a single node. 
Signed-off-by: Eric Dumazet Acked-by: Seth Jennings Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index aeaef0fb5624..008388fe7b0f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); -- cgit v1.2.3 From 71abdc15adf8c702a1dd535f8e30df50758848d2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 6 Jun 2014 14:35:35 -0700 Subject: mm: vmscan: clear kswapd's special reclaim powers before exiting When kswapd exits, it can end up taking locks that were previously held by allocating tasks while they waited for reclaim. Lockdep currently warns about this: On Wed, May 28, 2014 at 06:06:34PM +0800, Gu Zheng wrote: > inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-R} usage. > kswapd2/1151 [HC0[0]:SC0[0]:HE1:SE1] takes: > (&sig->group_rwsem){+++++?}, at: exit_signals+0x24/0x130 > {RECLAIM_FS-ON-W} state was registered at: > mark_held_locks+0xb9/0x140 > lockdep_trace_alloc+0x7a/0xe0 > kmem_cache_alloc_trace+0x37/0x240 > flex_array_alloc+0x99/0x1a0 > cgroup_attach_task+0x63/0x430 > attach_task_by_pid+0x210/0x280 > cgroup_procs_write+0x16/0x20 > cgroup_file_write+0x120/0x2c0 > vfs_write+0xc0/0x1f0 > SyS_write+0x4c/0xa0 > tracesys+0xdd/0xe2 > irq event stamp: 49 > hardirqs last enabled at (49): _raw_spin_unlock_irqrestore+0x36/0x70 > hardirqs last disabled at (48): _raw_spin_lock_irqsave+0x2b/0xa0 > softirqs last enabled at (0): copy_process.part.24+0x627/0x15f0 > softirqs last disabled at (0): (null) > > other info that might help us debug this: > Possible unsafe locking scenario: > > CPU0 > ---- > lock(&sig->group_rwsem); > > lock(&sig->group_rwsem); > > *** DEADLOCK *** > > no locks held by kswapd2/1151. > > stack backtrace: > CPU: 30 PID: 1151 Comm: kswapd2 Not tainted 3.10.39+ #4 > Call Trace: > dump_stack+0x19/0x1b > print_usage_bug+0x1f7/0x208 > mark_lock+0x21d/0x2a0 > __lock_acquire+0x52a/0xb60 > lock_acquire+0xa2/0x140 > down_read+0x51/0xa0 > exit_signals+0x24/0x130 > do_exit+0xb5/0xa50 > kthread+0xdb/0x100 > ret_from_fork+0x7c/0xb0 This is because the kswapd thread is still marked as a reclaimer at the time of exit. But because it is exiting, nobody is actually waiting on it to make reclaim progress anymore, and it's nothing but a regular thread at this point. Be tidy and strip it of all its powers (PF_MEMALLOC, PF_SWAPWRITE, PF_KSWAPD, and the lockdep reclaim state) before returning from the thread function. 
Signed-off-by: Johannes Weiner Reported-by: Gu Zheng Cc: Yasuaki Ishimatsu Cc: Tang Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9149444f947d..05d41c0d7f6c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3372,7 +3372,10 @@ static int kswapd(void *p) } } + tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + return 0; } -- cgit v1.2.3 From 844e4d66f4ec3b6b6d3bcfcfba3ade2b962771e2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 6 Jun 2014 14:35:36 -0700 Subject: slub: search partial list on numa_mem_id(), instead of numa_node_id() Currently, if allocation constraint to node is NUMA_NO_NODE, we search a partial slab on numa_node_id() node. This doesn't work properly on a system having memoryless nodes, since it can have no memory on that node so there must be no partial slab on that node. On that node, page allocation always falls back to numa_mem_id() first. So searching a partial slab on numa_node_id() in that case is the proper solution for the memoryless node case. Signed-off-by: Joonsoo Kim Acked-by: Nishanth Aravamudan Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Wanpeng Li Cc: Han Pingtian Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index fdf0fe4da9a9..b2b047327d76 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1726,7 +1726,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) -- cgit v1.2.3 From cccad5b983d2b0aa453879591ac4ab1c54ff9db6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:09 -0700 Subject: mm: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- mm/page_alloc.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 533fa60c9ac1..7d9a4ef0a078 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1664,7 +1664,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a59bdb653958..4f59fa29eda8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3389,7 +3389,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ -int numa_zonelist_order_handler(ctl_table *table, int write, +int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5805,7 +5805,7 @@ module_init(init_per_zone_wmark_min) * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int rc; @@ -5822,7 +5822,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, } #ifdef CONFIG_NUMA -int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5838,7 +5838,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return 0; } -int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; @@ -5864,7 +5864,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * minimum watermarks. The lowmem reserve ratio can only make sense * if in function of the boot time zone sizes. */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -5877,7 +5877,7 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu * pagelist can have before it gets flushed back to buddy allocator. */ -int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; -- cgit v1.2.3 From 688eb988d15af55c1d1b70b1ca9f6ce58f277c20 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 6 Jun 2014 14:38:15 -0700 Subject: vmscan: memcg: always use swappiness of the reclaimed memcg Memory reclaim always uses swappiness of the reclaim target memcg (origin of the memory pressure) or vm_swappiness for global memory reclaim. 
This behavior was consistent (except for the difference between global and hard limit reclaim) because swappiness was enforced to be consistent within each memcg hierarchy. After "mm: memcontrol: remove hierarchy restrictions for swappiness and oom_control" each memcg can have its own swappiness independent of its hierarchical parents, though, so the consistency guarantee is gone. This can lead to unexpected behavior. Say that a group is explicitly configured not to swap out by setting memory.swappiness=0, but its memory gets swapped out anyway when the memory pressure comes from its parent with a different swappiness. It is also unexpected that the knob is meaningless without setting the hard limit, which would trigger the reclaim and enforce the swappiness. There are setups where the hard limit is configured higher in the hierarchy by an administrator and child groups are under the control of somebody else who is interested in the swapout behavior but not necessarily in the memory limit. From a semantic point of view, swappiness is an attribute defining the anon vs. file proportional scanning of the LRU, which is memcg specific (unlike charges, which are propagated up the hierarchy), so it should be applied to the particular memcg's LRU regardless of where the memory pressure comes from. This patch removes vmscan_swappiness() and stores the swappiness into the scan_control structure. mem_cgroup_swappiness is then used to provide the correct value before shrink_lruvec is called. The global vm_swappiness is used for the root memcg. [hughd@google.com: oopses immediately when booted with cgroup_disable=memory] Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- mm/vmscan.c | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a500cb0594c4..9bf8a84bcaae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1550,7 +1550,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { /* root ? */ - if (!css_parent(&memcg->css)) + if (mem_cgroup_disabled() || !css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; diff --git a/mm/vmscan.c b/mm/vmscan.c index 05d41c0d7f6c..f44476a41544 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -83,6 +83,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* anon vs. file LRUs scanning "ratio" */ + int swappiness; + /* * The memory cgroup that hit its limit and as a result is the * primary target of this reclaim invocation. @@ -1845,13 +1848,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1912,7 +1908,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive.
*/ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !sc->swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1922,7 +1918,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && sc->swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1965,7 +1961,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = sc->swappiness; file_prio = 200 - anon_prio; /* @@ -2265,6 +2261,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) lruvec = mem_cgroup_zone_lruvec(zone, memcg); + sc->swappiness = mem_cgroup_swappiness(memcg); shrink_lruvec(lruvec, sc); /* @@ -2731,6 +2728,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_swap = !noswap, .order = 0, .priority = 0, + .swappiness = mem_cgroup_swappiness(memcg), .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); -- cgit v1.2.3 From aae0ad7ae5a997bff1862e6e5eebed39cdf08ff3 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Fri, 6 Jun 2014 14:38:16 -0700 Subject: mm/kmemleak.c: use %u to print ->checksum Signed-off-by: Jianpeng Ma Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 736ade31d1dc..5d4aec44982e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -387,7 +387,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%lx\n", object->flags); - pr_notice(" checksum = %d\n", object->checksum); + pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } -- cgit v1.2.3 From ffe2c748e283c5dc1b9b9ac116299dbfc11a609b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 6 Jun 2014 14:38:17 -0700 Subject: mm: introduce kmemleak_update_trace() The memory allocation stack trace is not always useful for debugging a memory leak (e.g. radix_tree_preload). This function, when called, updates the stack trace for an already allocated object. Signed-off-by: Catalin Marinas Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5d4aec44982e..3cda50c1e394 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -989,6 +989,40 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); +/** + * kmemleak_update_trace - update object allocation stack trace + * @ptr: pointer to beginning of the object + * + * Override the object allocation stack trace for cases where the actual + * allocation place is not always useful. 
+ */ +void __ref kmemleak_update_trace(const void *ptr) +{ + struct kmemleak_object *object; + unsigned long flags; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) + return; + + object = find_and_get_object((unsigned long)ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Updating stack trace for unknown object at %p\n", + ptr); +#endif + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); +} +EXPORT_SYMBOL(kmemleak_update_trace); + /** * kmemleak_not_leak - mark an allocated object as false positive * @ptr: pointer to beginning of the object -- cgit v1.2.3 From 174119628188b085c66fe7d86fbfb4cccb1bd864 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 6 Jun 2014 14:38:19 -0700 Subject: mm/mempool.c: update the kmemleak stack trace for mempool allocations When mempool_alloc() returns an existing pool object, kmemleak_alloc() is no longer called and the stack trace corresponds to the original object allocation. This patch updates the kmemleak allocation stack trace for such objects to make it more useful for debugging. Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 455d468c3a5d..e209c98c7203 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -222,6 +223,11 @@ repeat_alloc: spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); + /* + * Update the allocation stack trace as this is more useful + * for debugging. + */ + kmemleak_update_trace(element); return element; } -- cgit v1.2.3 From aedf95ea0583676cd7bfa395681ad744791a433e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 6 Jun 2014 14:38:20 -0700 Subject: mm/memblock.c: call kmemleak directly from memblock_(alloc|free) Kmemleak could ignore memory blocks allocated via memblock_alloc() leading to false positives during scanning. This patch adds the corresponding callbacks and removes kmemleak_free_* calls in mm/nobootmem.c to avoid duplication. The kmemleak_alloc() in mm/nobootmem.c is kept since __alloc_memory_core_early() does not use memblock_alloc() directly. Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 10 ++++++++-- mm/nobootmem.c | 2 -- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 0aa0d2b07624..6d2f219a48b0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -691,6 +691,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); return memblock_remove_range(&memblock.reserved, base, size); } @@ -1043,9 +1044,14 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; found = memblock_find_in_range_node(size, align, start, end, nid); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve(found, size)) { + /* + * The min_count is set to 0 so that memblock allocations are + * never reported as leaks. 
+ */ + kmemleak_alloc(__va(found), size, 0, 0); return found; - + } return 0; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 04a9d94333a5..7ed58602e71b 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -197,7 +197,6 @@ unsigned long __init free_all_bootmem(void) void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { - kmemleak_free_part(__va(physaddr), size); memblock_free(physaddr, size); } @@ -212,7 +211,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { - kmemleak_free_part(__va(addr), size); memblock_free(addr, size); } -- cgit v1.2.3 From e231875ba7a118de7970fae3ac08b244a2822074 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 6 Jun 2014 14:38:20 -0700 Subject: mm: memcontrol: clean up memcg zoneinfo lookup Memcg zoneinfo lookup sites have either the page, the zone, or the node id and zone index, but sites that only have the zone have to look up the node id and zone index themselves, whereas sites that already have those two integers use a function for a simple pointer chase. Provide mem_cgroup_zone_zoneinfo() that takes a zone pointer and let sites that already have node id and zone index - all for each node, for each zone iterators - use &memcg->nodeinfo[nid]->zoneinfo[zid]. Rename page_cgroup_zoneinfo() to mem_cgroup_page_zoneinfo() to match. Signed-off-by: Jianyu Zhan Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 89 +++++++++++++++++++++++++-------------------------------- 1 file changed, 39 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9bf8a84bcaae..41c1b393fef5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -676,9 +676,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg) static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { - VM_BUG_ON((unsigned)nid >= nr_node_ids); + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } @@ -688,12 +690,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(memcg, nid, zid); + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } static struct mem_cgroup_tree_per_zone * @@ -772,16 +774,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) unsigned long long excess; struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page); /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. 
*/ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = mem_cgroup_page_zoneinfo(memcg, page); excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or @@ -804,14 +804,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - int node, zone; - struct mem_cgroup_per_zone *mz; struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_zone *mz; + int nid, zid; - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + mctz = soft_limit_tree_node_zone(nid, zid); mem_cgroup_remove_exceeded(memcg, mz, mctz); } } @@ -946,8 +946,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -955,46 +954,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, - unsigned int lru_mask) -{ - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - unsigned long ret = 0; - - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - - for_each_lru(lru) { - if (BIT(lru) & lru_mask) - ret += mz->lru_size[lru]; - } - return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, + unsigned int lru_mask) { - u64 total = 0; + unsigned long nr = 0; int zid; - for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(memcg, - nid, zid, lru_mask); + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct mem_cgroup_per_zone *mz; + enum lru_list lru; - return total; + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + nr += mz->lru_size[lru]; + } + } + return nr; } static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { + unsigned long nr = 0; int nid; - u64 total = 0; for_each_node_state(nid, N_MEMORY) - total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return total; + nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + return nr; } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1242,11 +1233,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, int uninitialized_var(seq); if (reclaim) { - int nid = zone_to_nid(reclaim->zone); - int zid = zone_idx(reclaim->zone); struct mem_cgroup_per_zone *mz; - mz = mem_cgroup_zoneinfo(root, nid, zid); + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; @@ -1353,7 +1342,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, goto out; } - mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); + mz = mem_cgroup_zone_zoneinfo(memcg, zone); lruvec = &mz->lruvec; out: 
/* @@ -1412,7 +1401,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) pc->mem_cgroup = memcg = root_mem_cgroup; - mz = page_cgroup_zoneinfo(memcg, page); + mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -5305,7 +5294,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for_each_online_node(nid) for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; rstat = &mz->lruvec.reclaim_stat; recent_rotated[0] += rstat->recent_rotated[0]; -- cgit v1.2.3 From cf2c81279eb0f66d382c0e889c266f8a57785cc4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 6 Jun 2014 14:38:21 -0700 Subject: mm: memcontrol: remove unnecessary memcg argument from soft limit functions Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 41c1b393fef5..a9559b91603c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -713,11 +713,9 @@ soft_limit_tree_from_page(struct page *page) return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; @@ -747,10 +745,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, mz->on_tree = true; } -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { if (!mz->on_tree) return; @@ -758,13 +754,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, mz->on_tree = false; } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); spin_unlock(&mctz->lock); } @@ -791,12 +785,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. 
*/ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); } } @@ -812,7 +806,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for (zid = 0; zid < MAX_NR_ZONES; zid++) { mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; mctz = soft_limit_tree_node_zone(nid, zid); - mem_cgroup_remove_exceeded(memcg, mz, mctz); + mem_cgroup_remove_exceeded(mz, mctz); } } } @@ -835,7 +829,7 @@ retry: * we will to add it back at the end of reclaim to its correct * position in the tree. */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); if (!res_counter_soft_limit_excess(&mz->memcg->res) || !css_tryget(&mz->memcg->css)) goto retry; @@ -4586,7 +4580,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; } while (1); } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); + __mem_cgroup_remove_exceeded(mz, mctz); excess = res_counter_soft_limit_excess(&mz->memcg->res); /* * One school of thought says that we should not add * term TODO. */ /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); + __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock(&mctz->lock); css_put(&mz->memcg->css); loop++; -- cgit v1.2.3 From 33041a0d76d3c3e0aff28ac95a2ffdedf1282dbc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 6 Jun 2014 14:38:23 -0700 Subject: mm: mark remap_file_pages() syscall as deprecated The remap_file_pages() system call is used to create a nonlinear mapping, that is, a mapping in which the pages of the file are mapped into a nonsequential order in memory. The advantage of using remap_file_pages() over using repeated calls to mmap(2) is that the former approach does not require the kernel to create additional VMA (Virtual Memory Area) data structures. Supporting nonlinear mappings requires a significant amount of non-trivial code in the kernel virtual memory subsystem, including hot paths. Also, to make nonlinear mappings work, the kernel needs a way to distinguish normal page table entries from entries with a file offset (pte_file), and it reserves a flag in the PTE for this purpose. PTE flags are a scarce resource, especially on some CPU architectures. It would be nice to free up the flag for other usage. Fortunately, there are not many users of remap_file_pages() in the wild. It's only known that one enterprise RDBMS implementation uses the syscall on 32-bit systems to map files bigger than can linearly fit into the 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. The plan is to deprecate the syscall and replace it with an emulation. The emulation will create new VMAs instead of nonlinear mappings. It's going to be slower for the rare users of remap_file_pages(), but the ABI is preserved. One side effect of the emulation (apart from performance) is that a user can hit the vm.max_map_count limit more easily due to the additional VMAs. See the comment for DEFAULT_MAX_MAP_COUNT for more details on the limit. [akpm@linux-foundation.org: fix spello] Signed-off-by: Kirill A.
Shutemov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Armin Rigo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 2c5646f11f41..72b8fa361433 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -149,6 +149,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, int has_write_lock = 0; vm_flags_t vm_flags = 0; + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + if (prot) return err; /* -- cgit v1.2.3 From f59428ab73ce83adf801d86787c450cef4d9fff9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 6 Jun 2014 14:38:26 -0700 Subject: mm/kmemleak-test.c: use pr_fmt for logging Signed-off-by: Fabian Frederick Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak-test.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index ff0d9779cec8..dcdcadb69533 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -18,6 +18,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "kmemleak: " fmt + #include #include #include @@ -50,25 +52,25 @@ static int __init kmemleak_test_init(void) printk(KERN_INFO "Kmemleak testing\n"); /* make some orphan objects */ - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); #ifndef CONFIG_MODULES - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = %p\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); #endif - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); - pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); /* * Add elements to a list. 
They should only appear as orphan @@ -76,7 +78,7 @@ static int __init kmemleak_test_init(void) */ for (i = 0; i < 10; i++) { elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; INIT_LIST_HEAD(&elem->list); @@ -85,7 +87,7 @@ static int __init kmemleak_test_init(void) for_each_possible_cpu(i) { per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmemleak: kmalloc(129) = %p\n", + pr_info("kmalloc(129) = %p\n", per_cpu(kmemleak_test_pointer, i)); } -- cgit v1.2.3 From b1de0d139c97a6078bbada6cf2d27c30ce127a97 Mon Sep 17 00:00:00 2001 From: Mitchel Humpherys Date: Fri, 6 Jun 2014 14:38:30 -0700 Subject: mm: convert some level-less printks to pr_* printk is meant to be used with an associated log level. There are some instances of printk scattered around the mm code where the log level is missing. Add a log level and adhere to suggestions by scripts/checkpatch.pl by moving to the pr_* macros. Also add the typical pr_fmt definition so that print statements can be easily traced back to the modules where they occur, correlated one with another, etc. This will require the removal of some (now redundant) prefixes on a few print statements. Signed-off-by: Mitchel Humpherys Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 ++++- mm/mmap.c | 21 ++++++++++++--------- mm/nommu.c | 5 ++++- mm/vmscan.c | 5 ++++- 4 files changed, 24 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 16bc9fa42998..1c16c228f35a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -65,6 +65,8 @@ kernel is not always grateful with that. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -91,6 +93,7 @@ #include #include #include +#include #include #include @@ -2645,7 +2648,7 @@ void __init numa_policy_init(void) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) - printk("numa_policy_init: interleaving failed\n"); + pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } diff --git a/mm/mmap.c b/mm/mmap.c index ced5efcdd4b6..129b847d30cc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -6,6 +6,8 @@ * Address space accounting code */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #include #include @@ -361,20 +364,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - printk("vm_end %lx < vm_start %lx\n", + pr_info("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - printk("free gap %lx, correct %lx\n", + pr_info("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -388,7 +391,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - printk("backwards %d, forwards %d\n", j, i); + pr_info("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? 
-1 : i; @@ -423,17 +426,17 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - printk("map_count %d vm_next %d\n", mm->map_count, i); + pr_info("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - printk("mm->highest_vm_end %lx, found %lx\n", + pr_info("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - printk("map_count %d rb %d\n", mm->map_count, i); + pr_info("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); @@ -3280,7 +3283,7 @@ static struct notifier_block reserve_mem_nb = { static int __meminit init_reserve_notifier(void) { if (register_hotmemory_notifier(&reserve_mem_nb)) - printk("Failed registering memory add/remove notifier for admin reserve"); + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; } diff --git a/mm/nommu.c b/mm/nommu.c index 85f8d6698d48..b78e3a8f5ee7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -13,6 +13,8 @@ * Copyright (c) 2007-2010 Paul Mundt */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -32,6 +34,7 @@ #include #include #include +#include #include #include @@ -1246,7 +1249,7 @@ error_free: return ret; enomem: - printk("Allocation of length %lu from process %d (%s) failed\n", + pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); show_free_areas(0); return -ENOMEM; diff --git a/mm/vmscan.c b/mm/vmscan.c index f44476a41544..71f23c0c1090 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -43,6 +45,7 @@ #include #include #include +#include #include #include @@ -480,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } -- cgit v1.2.3
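As a footnote to the pr_* conversion above, here is a minimal, self-contained sketch (not code from the patch; the function name is made up) of how the pr_fmt() definition interacts with the pr_* macros:

	/*
	 * Must be defined before any #include so that the pr_* macros in
	 * <linux/printk.h> pick it up instead of the default (empty) prefix.
	 */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/printk.h>

	static void example_report(void)
	{
		/*
		 * With pr_fmt() defined above, this expands to roughly
		 * printk(KERN_INFO KBUILD_MODNAME ": orphaned page\n"),
		 * so every message carries a log level and a per-module
		 * prefix without spelling either out by hand.
		 */
		pr_info("orphaned page\n");
	}

The same mechanism is what allows the earlier kmemleak-test patch to drop its hand-written "kmemleak: " prefixes: once pr_fmt() supplies the prefix, repeating it in each format string would print it twice.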